diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index bdc8e26b14..10f52af4ef 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -193,6 +193,8 @@ image_prometheus: <account_id>.dkr.ecr.<region>.amazonaws.com/cortexlabs/prometh
 image_prometheus_config_reloader: <account_id>.dkr.ecr.<region>.amazonaws.com/cortexlabs/prometheus-config-reloader:master
 image_prometheus_operator: <account_id>.dkr.ecr.<region>.amazonaws.com/cortexlabs/prometheus-operator:master
 image_prometheus_statsd_exporter: <account_id>.dkr.ecr.<region>.amazonaws.com/cortexlabs/prometheus-statsd-exporter:master
+image_prometheus_dcgm_exporter: <account_id>.dkr.ecr.<region>.amazonaws.com/cortexlabs/prometheus-dcgm-exporter:master
+image_prometheus_kube_state_metrics_exporter: <account_id>.dkr.ecr.<region>.amazonaws.com/cortexlabs/prometheus-kube-state-metrics:master
 image_prometheus_node_exporter: <account_id>.dkr.ecr.<region>.amazonaws.com/cortexlabs/prometheus-node-exporter:master
 image_kube_rbac_proxy: <account_id>.dkr.ecr.<region>.amazonaws.com/cortexlabs/kube-rbac-proxy:master
 image_grafana: <account_id>.dkr.ecr.<region>.amazonaws.com/cortexlabs/grafana:master
@@ -224,6 +226,8 @@ image_prometheus: gcr.io/<project_id>/cortexlabs/prometheus:master
 image_prometheus_config_reloader: gcr.io/<project_id>/cortexlabs/prometheus-config-reloader:master
 image_prometheus_operator: gcr.io/<project_id>/cortexlabs/prometheus-operator:master
 image_prometheus_statsd_exporter: gcr.io/<project_id>/cortexlabs/prometheus-statsd-exporter:master
+image_prometheus_dcgm_exporter: gcr.io/<project_id>/cortexlabs/prometheus-dcgm-exporter:master
+image_prometheus_kube_state_metrics_exporter: gcr.io/<project_id>/cortexlabs/prometheus-kube-state-metrics:master
 image_prometheus_node_exporter: gcr.io/<project_id>/cortexlabs/prometheus-node-exporter:master
 image_kube_rbac_proxy: gcr.io/<project_id>/cortexlabs/kube-rbac-proxy:master
 image_grafana: gcr.io/<project_id>/cortexlabs/grafana:master
diff --git a/build/images.sh b/build/images.sh
index 95cbd7fc3e..6b3742e711 100644
--- a/build/images.sh
+++ b/build/images.sh
@@ -58,6 +58,8 @@ non_dev_images_cluster=(
   "prometheus-config-reloader"
   "prometheus-operator"
   "prometheus-statsd-exporter"
+  "prometheus-dcgm-exporter"
+  "prometheus-kube-state-metrics"
   "prometheus-node-exporter"
   "kube-rbac-proxy"
   "grafana"
diff --git a/charts/dashboards/batch.json b/charts/dashboards/batch.json
index a3ed008a91..565107bc73 100644
--- a/charts/dashboards/batch.json
+++ b/charts/dashboards/batch.json
@@ -15,6 +15,7 @@
   "editable": true,
   "gnetId": null,
   "graphTooltip": 0,
+  "iteration": 1614622843373,
   "links": [],
   "panels": [
     {
@@ -36,13 +37,25 @@
         "content": "<h1 style=\"text-align: center\">BatchAPI</h1>\n",
         "mode": "markdown"
       },
-      "pluginVersion": "7.4.0",
+      "pluginVersion": "7.4.2",
       "timeFrom": null,
       "timeShift": null,
       "title": "",
       "transparent": true,
       "type": "text"
     },
+    {
+      "datasource": null,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 2
+      },
+      "id": 22,
+      "title": "API Stats",
+      "type": "row"
+    },
     {
       "aliasColors": {},
       "bars": false,
@@ -62,7 +75,7 @@
         "h": 9,
         "w": 12,
         "x": 0,
-        "y": 2
+        "y": 3
       },
       "hiddenSeries": false,
       "id": 2,
@@ -83,7 +96,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0",
+      "pluginVersion": "7.4.2",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -162,7 +175,7 @@
         "h": 9,
         "w": 12,
         "x": 12,
-        "y": 2
+        "y": 3
       },
       "hiddenSeries": false,
       "id": 3,
@@ -183,7 +196,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0",
+      "pluginVersion": "7.4.2",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -219,6 +232,7 @@
       },
       "yaxes": [
         {
+          "$$hashKey": "object:262",
           "decimals": 0,
           "format": "short",
           "label": null,
@@ -228,6 +242,7 @@
           "show": true
         },
         {
+          "$$hashKey": "object:263",
           "format": "short",
           "label": null,
           "logBase": 1,
@@ -259,8 +274,8 @@
       "gridPos": {
         "h": 8,
         "w": 12,
-        "x": 6,
-        "y": 11
+        "x": 0,
+        "y": 12
       },
       "hiddenSeries": false,
       "id": 5,
@@ -281,7 +296,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0",
+      "pluginVersion": "7.4.2",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -337,6 +352,1078 @@
         "align": false,
         "alignLevel": null
       }
+    },
+    {
+      "aliasColors": {
+        "Active Jobs": "semi-dark-green",
+        "Active Workers": "semi-dark-orange"
+      },
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": null,
+      "description": "Active jobs/workers",
+      "fieldConfig": {
+        "defaults": {
+          "custom": {}
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 12
+      },
+      "hiddenSeries": false,
+      "id": 20,
+      "legend": {
+        "alignAsTable": true,
+        "avg": true,
+        "current": true,
+        "max": true,
+        "min": true,
+        "show": true,
+        "total": false,
+        "values": true
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "7.4.2",
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "count(kube_job_status_active{job_name=~\"$api_name.+\"})",
+          "interval": "",
+          "legendFormat": "Active Jobs",
+          "refId": "Active Batches"
+        },
+        {
+          "expr": "sum(kube_pod_info{exported_pod=~\"$api_name.+\"})",
+          "hide": false,
+          "interval": "",
+          "legendFormat": "Active Workers",
+          "refId": "Active Workers"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "# Active Jobs/Workers",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "$$hashKey": "object:128",
+          "decimals": 0,
+          "format": "count",
+          "label": "",
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "$$hashKey": "object:129",
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "datasource": null,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 20
+      },
+      "id": 11,
+      "title": "Aggregate Worker Usage",
+      "type": "row"
+    },
+    {
+      "aliasColors": {
+        "Total CPU Request": "semi-dark-orange",
+        "Total CPU Usage": "semi-dark-green"
+      },
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": null,
+      "description": "Total CPU usage across all workers of the API",
+      "fieldConfig": {
+        "defaults": {
+          "color": {},
+          "custom": {},
+          "thresholds": {
+            "mode": "absolute",
+            "steps": []
+          }
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 21
+      },
+      "hiddenSeries": false,
+      "id": 13,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "7.4.2",
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "exemplar": false,
+          "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))",
+          "format": "time_series",
+          "instant": false,
+          "interval": "",
+          "legendFormat": "Total CPU Usage",
+          "refId": "CPU Usage"
+        },
+        {
+          "expr": "sum(kube_pod_container_resource_requests_cpu_cores{exported_pod=~\"$api_name.+\"})",
+          "hide": false,
+          "interval": "",
+          "legendFormat": "Total CPU Request",
+          "refId": "CPU Request"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "Total CPU Usage",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "$$hashKey": "object:1404",
+          "format": "core",
+          "label": "cpu",
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "$$hashKey": "object:1405",
+          "format": "short",
+          "label": "",
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "aliasColors": {
+        "Total Memory Request": "semi-dark-orange",
+        "Total Memory Usage": "semi-dark-green"
+      },
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": null,
+      "description": "Total memory usage across all workers of the API",
+      "fieldConfig": {
+        "defaults": {
+          "color": {},
+          "custom": {},
+          "thresholds": {
+            "mode": "absolute",
+            "steps": []
+          }
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 21
+      },
+      "hiddenSeries": false,
+      "id": 15,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "7.4.2",
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "exemplar": false,
+          "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container!=\"POD\"}[1m]))\n/\nsum(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2",
+          "format": "time_series",
+          "instant": false,
+          "interval": "",
+          "legendFormat": "Total Memory Usage",
+          "refId": "Memory Usage"
+        },
+        {
+          "expr": "sum(kube_pod_container_resource_requests_memory_bytes{exported_pod=~\"$api_name.+\"}) / 1024^2",
+          "hide": false,
+          "interval": "",
+          "legendFormat": "Total Memory Request",
+          "refId": "Memory Request"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "Total Memory Usage",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "$$hashKey": "object:1404",
+          "format": "MiB",
+          "label": "memory",
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "$$hashKey": "object:1405",
+          "format": "short",
+          "label": "",
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "aliasColors": {
+        "Total GPU Capacity": "semi-dark-orange",
+        "Total GPU Usage": "semi-dark-green",
+        "Total GPU Utilization": "light-green"
+      },
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": null,
+      "description": "Total GPU core usage across all workers of the API",
+      "fieldConfig": {
+        "defaults": {
+          "color": {},
+          "custom": {},
+          "thresholds": {
+            "mode": "absolute",
+            "steps": []
+          }
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 29
+      },
+      "hiddenSeries": false,
+      "id": 17,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "7.4.2",
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) / 100",
+          "hide": false,
+          "interval": "",
+          "legendFormat": "Total GPU Usage",
+          "refId": "GPU Usage"
+        },
+        {
+          "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})",
+          "hide": false,
+          "interval": "",
+          "legendFormat": "Total GPU Capacity",
+          "refId": "GPU Capacity"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "Total GPU Core Usage",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "$$hashKey": "object:1404",
+          "format": "gpuCore",
+          "label": "gpu",
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "$$hashKey": "object:1405",
+          "format": "short",
+          "label": "",
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "aliasColors": {
+        "Total Capacity GPU Memory": "semi-dark-orange",
+        "Total Used GPU Memory": "semi-dark-green"
+      },
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": null,
+      "description": "Total GPU memory usage across all workers of the API",
+      "fieldConfig": {
+        "defaults": {
+          "color": {},
+          "custom": {},
+          "thresholds": {
+            "mode": "absolute",
+            "steps": []
+          }
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 29
+      },
+      "hiddenSeries": false,
+      "id": 19,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "7.4.2",
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})",
+          "hide": false,
+          "interval": "",
+          "legendFormat": "Total Used GPU Memory",
+          "refId": "GPU Used Memory"
+        },
+        {
+          "exemplar": false,
+          "expr": "sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})",
+          "format": "time_series",
+          "instant": false,
+          "interval": "",
+          "legendFormat": "Total Capacity GPU Memory",
+          "refId": "GPU Capacity Memory"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "Total GPU Memory Usage",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "$$hashKey": "object:1404",
+          "format": "MiB",
+          "label": "memory",
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "$$hashKey": "object:1405",
+          "format": "short",
+          "label": "",
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "collapsed": false,
+      "datasource": null,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 37
+      },
+      "id": 9,
+      "panels": [],
+      "title": "Avg Worker Usage",
+      "type": "row"
+    },
+    {
+      "aliasColors": {
+        "Avg CPU Request": "semi-dark-orange",
+        "Avg CPU Usage": "semi-dark-green",
+        "Total CPU Request": "semi-dark-orange",
+        "Total CPU Usage": "semi-dark-green"
+      },
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": null,
+      "description": "Avg CPU usage across all workers of the API",
+      "fieldConfig": {
+        "defaults": {
+          "color": {},
+          "custom": {},
+          "thresholds": {
+            "mode": "absolute",
+            "steps": []
+          }
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 38
+      },
+      "hiddenSeries": false,
+      "id": 23,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "7.4.2",
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "exemplar": false,
+          "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})",
+          "format": "time_series",
+          "instant": false,
+          "interval": "",
+          "legendFormat": "Avg CPU Usage",
+          "refId": "CPU Usage"
+        },
+        {
+          "expr": "sum(kube_pod_container_resource_requests_cpu_cores{exported_pod=~\"$api_name.+\"})\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})",
+          "hide": false,
+          "interval": "",
+          "legendFormat": "Avg CPU Request",
+          "refId": "CPU Request"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "Avg CPU Usage",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "$$hashKey": "object:1404",
+          "format": "core",
+          "label": "cpu",
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "$$hashKey": "object:1405",
+          "format": "short",
+          "label": "",
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "aliasColors": {
+        "Avg Memory Request": "semi-dark-orange",
+        "Avg Memory Usage": "semi-dark-green",
+        "Total Memory Request": "semi-dark-orange",
+        "Total Memory Usage": "semi-dark-green"
+      },
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": null,
+      "description": "Avg memory usage across all workers of the API",
+      "fieldConfig": {
+        "defaults": {
+          "color": {},
+          "custom": {},
+          "thresholds": {
+            "mode": "absolute",
+            "steps": []
+          }
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 38
+      },
+      "hiddenSeries": false,
+      "id": 24,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "7.4.2",
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "exemplar": false,
+          "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container!=\"POD\"}[1m]))\n/\nsum(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})",
+          "format": "time_series",
+          "instant": false,
+          "interval": "",
+          "legendFormat": "Avg Memory Usage",
+          "refId": "Memory Usage"
+        },
+        {
+          "expr": "sum(kube_pod_container_resource_requests_memory_bytes{exported_pod=~\"$api_name.+\"}) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})",
+          "hide": false,
+          "interval": "",
+          "legendFormat": "Avg Memory Request",
+          "refId": "Memory Request"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "Avg Memory Usage",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "$$hashKey": "object:1404",
+          "format": "MiB",
+          "label": "memory",
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "$$hashKey": "object:1405",
+          "format": "short",
+          "label": "",
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "aliasColors": {
+        "Avg GPU Capacity": "semi-dark-orange",
+        "Avg GPU Usage": "semi-dark-green",
+        "Total GPU Capacity": "semi-dark-orange",
+        "Total GPU Usage": "semi-dark-green",
+        "Total GPU Utilization": "light-green"
+      },
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": null,
+      "description": "Avg GPU core usage across all workers of the API",
+      "fieldConfig": {
+        "defaults": {
+          "color": {},
+          "custom": {},
+          "thresholds": {
+            "mode": "absolute",
+            "steps": []
+          }
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 46
+      },
+      "hiddenSeries": false,
+      "id": 25,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "7.4.2",
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})",
+          "hide": false,
+          "instant": false,
+          "interval": "",
+          "legendFormat": "Avg GPU Usage",
+          "refId": "GPU Usage"
+        },
+        {
+          "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) by (exported_pod))",
+          "hide": false,
+          "instant": false,
+          "interval": "",
+          "legendFormat": "Avg GPU Capacity",
+          "refId": "GPU Capacity"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "Avg GPU Core Usage",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "$$hashKey": "object:1404",
+          "format": "gpuCore",
+          "label": "gpu",
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "$$hashKey": "object:1405",
+          "format": "short",
+          "label": "",
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "aliasColors": {
+        "Avg Capacity GPU Memory": "semi-dark-orange",
+        "Avg Used GPU Memory": "semi-dark-green",
+        "Total Capacity GPU Memory": "semi-dark-orange",
+        "Total Used GPU Memory": "semi-dark-green"
+      },
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": null,
+      "description": "Avg GPU memory usage across all workers of the API",
+      "fieldConfig": {
+        "defaults": {
+          "color": {},
+          "custom": {},
+          "thresholds": {
+            "mode": "absolute",
+            "steps": []
+          }
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 46
+      },
+      "hiddenSeries": false,
+      "id": 26,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "7.4.2",
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})",
+          "hide": false,
+          "interval": "",
+          "legendFormat": "Avg Used GPU Memory",
+          "refId": "GPU Used Memory"
+        },
+        {
+          "exemplar": false,
+          "expr": "(sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})",
+          "format": "time_series",
+          "instant": false,
+          "interval": "",
+          "legendFormat": "Avg Capacity GPU Memory",
+          "refId": "GPU Capacity Memory"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "Avg GPU Memory Usage",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "$$hashKey": "object:1404",
+          "format": "MiB",
+          "label": "memory",
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "$$hashKey": "object:1405",
+          "format": "short",
+          "label": "",
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
     }
   ],
   "refresh": "30s",
@@ -362,7 +1449,7 @@
           "query": "label_values({__name__=~\"cortex_batch_.+\"}, api_name)",
           "refId": "StandardVariableQuery"
         },
-        "refresh": 1,
+        "refresh": 2,
         "regex": "",
         "skipUrlSync": false,
         "sort": 0,
diff --git a/charts/dashboards/realtime.json b/charts/dashboards/realtime.json
index d4ba134527..6ac9f89d5c 100644
--- a/charts/dashboards/realtime.json
+++ b/charts/dashboards/realtime.json
@@ -15,8 +15,7 @@
   "editable": true,
   "gnetId": null,
   "graphTooltip": 0,
-  "id": 10,
-  "iteration": 1612793050833,
+  "iteration": 1614624509947,
   "links": [],
   "panels": [
     {
@@ -38,13 +37,27 @@
         "content": "<h1 style=\"text-align: center\">RealtimeAPI</h1>",
         "mode": "markdown"
       },
-      "pluginVersion": "7.4.0",
+      "pluginVersion": "7.4.2",
       "timeFrom": null,
       "timeShift": null,
       "title": "",
       "transparent": true,
       "type": "text"
     },
+    {
+      "collapsed": false,
+      "datasource": null,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 2
+      },
+      "id": 22,
+      "panels": [],
+      "title": "API Stats",
+      "type": "row"
+    },
     {
       "aliasColors": {},
       "bars": false,
@@ -64,7 +77,7 @@
         "h": 9,
         "w": 12,
         "x": 0,
-        "y": 2
+        "y": 3
       },
       "hiddenSeries": false,
       "id": 2,
@@ -86,7 +99,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0",
+      "pluginVersion": "7.4.2",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -178,7 +191,7 @@
         "h": 9,
         "w": 12,
         "x": 12,
-        "y": 2
+        "y": 3
       },
       "hiddenSeries": false,
       "id": 4,
@@ -199,7 +212,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0",
+      "pluginVersion": "7.4.2",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -276,7 +289,7 @@
         "h": 9,
         "w": 12,
         "x": 0,
-        "y": 11
+        "y": 12
       },
       "hiddenSeries": false,
       "id": 8,
@@ -298,7 +311,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0",
+      "pluginVersion": "7.4.2",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -350,6 +363,7 @@
       },
       "yaxes": [
         {
+          "$$hashKey": "object:1217",
           "format": "reqps",
           "label": null,
           "logBase": 1,
@@ -358,6 +372,7 @@
           "show": true
         },
         {
+          "$$hashKey": "object:1218",
           "format": "short",
           "label": null,
           "logBase": 1,
@@ -389,7 +404,7 @@
         "h": 9,
         "w": 12,
         "x": 12,
-        "y": 11
+        "y": 12
       },
       "hiddenSeries": false,
       "id": 7,
@@ -410,7 +425,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0",
+      "pluginVersion": "7.4.2",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -423,7 +438,7 @@
           "expr": "count(cortex_in_flight_requests{api_name=~\"$api_name\"}) by (api_name)",
           "interval": "",
           "legendFormat": "{{api_name}}",
-          "refId": "A"
+          "refId": "Active Replicas"
         }
       ],
       "thresholds": [],
@@ -446,6 +461,7 @@
       },
       "yaxes": [
         {
+          "$$hashKey": "object:236",
           "decimals": 0,
           "format": "short",
           "label": null,
@@ -455,6 +471,7 @@
           "show": true
         },
         {
+          "$$hashKey": "object:237",
           "format": "short",
           "label": null,
           "logBase": 1,
@@ -487,7 +504,7 @@
         "h": 9,
         "w": 12,
         "x": 0,
-        "y": 20
+        "y": 21
       },
       "hiddenSeries": false,
       "id": 9,
@@ -509,7 +526,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0",
+      "pluginVersion": "7.4.2",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -601,7 +618,7 @@
         "h": 9,
         "w": 12,
         "x": 12,
-        "y": 20
+        "y": 21
       },
       "hiddenSeries": false,
       "id": 10,
@@ -623,7 +640,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0",
+      "pluginVersion": "7.4.2",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -715,7 +732,7 @@
         "h": 9,
         "w": 12,
         "x": 0,
-        "y": 29
+        "y": 30
       },
       "hiddenSeries": false,
       "id": 6,
@@ -736,7 +753,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0",
+      "pluginVersion": "7.4.2",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -788,6 +805,7 @@
       },
       "yaxes": [
         {
+          "$$hashKey": "object:1302",
           "format": "ms",
           "label": null,
           "logBase": 1,
@@ -796,6 +814,7 @@
           "show": true
         },
         {
+          "$$hashKey": "object:1303",
           "format": "short",
           "label": null,
           "logBase": 1,
@@ -828,7 +847,7 @@
         "h": 9,
         "w": 12,
         "x": 12,
-        "y": 29
+        "y": 30
       },
       "hiddenSeries": false,
       "id": 11,
@@ -849,7 +868,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0",
+      "pluginVersion": "7.4.2",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -924,7 +943,9 @@
       }
     },
     {
-      "aliasColors": {},
+      "aliasColors": {
+        "iris-classifier": "light-green"
+      },
       "bars": false,
       "dashLength": 10,
       "dashes": false,
@@ -942,7 +963,7 @@
         "h": 9,
         "w": 12,
         "x": 0,
-        "y": 38
+        "y": 39
       },
       "hiddenSeries": false,
       "id": 16,
@@ -963,7 +984,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0",
+      "pluginVersion": "7.4.2",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -1056,7 +1077,7 @@
         "h": 9,
         "w": 12,
         "x": 12,
-        "y": 38
+        "y": 39
       },
       "hiddenSeries": false,
       "id": 12,
@@ -1077,7 +1098,7 @@
         "alertThreshold": true
       },
       "percentage": false,
-      "pluginVersion": "7.4.0",
+      "pluginVersion": "7.4.2",
       "pointradius": 2,
       "points": false,
       "renderer": "flot",
@@ -1150,6 +1171,967 @@
         "align": false,
         "alignLevel": null
       }
+    },
+    {
+      "collapsed": false,
+      "datasource": null,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 48
+      },
+      "id": 20,
+      "panels": [],
+      "title": "Aggregate Usage",
+      "type": "row"
+    },
+    {
+      "aliasColors": {
+        "Total CPU Request": "semi-dark-orange",
+        "Total CPU Usage": "semi-dark-green"
+      },
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": null,
+      "description": "Total CPU usage across all replicas of the API",
+      "fieldConfig": {
+        "defaults": {
+          "color": {},
+          "custom": {},
+          "thresholds": {
+            "mode": "absolute",
+            "steps": []
+          }
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 49
+      },
+      "hiddenSeries": false,
+      "id": 24,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "7.4.2",
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "exemplar": false,
+          "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))",
+          "format": "time_series",
+          "instant": false,
+          "interval": "",
+          "legendFormat": "Total CPU Usage",
+          "refId": "CPU Usage"
+        },
+        {
+          "expr": "sum(kube_pod_container_resource_requests_cpu_cores{exported_pod=~\"api-$api_name.+\"})",
+          "hide": false,
+          "interval": "",
+          "legendFormat": "Total CPU Request",
+          "refId": "CPU Request"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "Total CPU Usage",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "$$hashKey": "object:1404",
+          "format": "core",
+          "label": "cpu",
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "$$hashKey": "object:1405",
+          "format": "short",
+          "label": "",
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "aliasColors": {
+        "Total Memory Request": "semi-dark-orange",
+        "Total Memory Usage": "semi-dark-green"
+      },
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": null,
+      "description": "Total memory usage across all replicas of the API",
+      "fieldConfig": {
+        "defaults": {
+          "color": {},
+          "custom": {},
+          "thresholds": {
+            "mode": "absolute",
+            "steps": []
+          }
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 49
+      },
+      "hiddenSeries": false,
+      "id": 26,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "7.4.2",
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "exemplar": false,
+          "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container!=\"POD\"}[1m]))\n/\nsum(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2",
+          "format": "time_series",
+          "instant": false,
+          "interval": "",
+          "legendFormat": "Total Memory Usage",
+          "refId": "Memory Usage"
+        },
+        {
+          "expr": "sum(kube_pod_container_resource_requests_memory_bytes{exported_pod=~\"api-$api_name.+\"}) / 1024^2",
+          "hide": false,
+          "interval": "",
+          "legendFormat": "Total Memory Request",
+          "refId": "Memory Request"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "Total Memory Usage",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "$$hashKey": "object:1404",
+          "format": "MiB",
+          "label": "memory",
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "$$hashKey": "object:1405",
+          "format": "short",
+          "label": "",
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "aliasColors": {
+        "Total GPU Capacity": "semi-dark-orange",
+        "Total GPU Usage": "semi-dark-green",
+        "Total GPU Utilization": "light-green"
+      },
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": null,
+      "description": "Total GPU core usage across all replicas of the API",
+      "fieldConfig": {
+        "defaults": {
+          "color": {},
+          "custom": {},
+          "thresholds": {
+            "mode": "absolute",
+            "steps": []
+          }
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 57
+      },
+      "hiddenSeries": false,
+      "id": 28,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "7.4.2",
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"}) / 100",
+          "hide": false,
+          "interval": "",
+          "legendFormat": "Total GPU Usage",
+          "refId": "GPU Usage"
+        },
+        {
+          "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"})",
+          "hide": false,
+          "interval": "",
+          "legendFormat": "Total GPU Capacity",
+          "refId": "GPU Capacity"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "Total GPU Core Usage",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "$$hashKey": "object:1404",
+          "format": "gpuCore",
+          "label": "gpu",
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "$$hashKey": "object:1405",
+          "format": "short",
+          "label": "",
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "aliasColors": {
+        "Total Capacity GPU Memory": "semi-dark-orange",
+        "Total Used GPU Memory": "semi-dark-green"
+      },
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": null,
+      "description": "Total GPU memory usage across all replicas of the API",
+      "fieldConfig": {
+        "defaults": {
+          "color": {},
+          "custom": {},
+          "thresholds": {
+            "mode": "absolute",
+            "steps": []
+          }
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 57
+      },
+      "hiddenSeries": false,
+      "id": 29,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "7.4.2",
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})",
+          "hide": false,
+          "interval": "",
+          "legendFormat": "Total Used GPU Memory",
+          "refId": "GPU Used Memory"
+        },
+        {
+          "exemplar": false,
+          "expr": "sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})",
+          "format": "time_series",
+          "instant": false,
+          "interval": "",
+          "legendFormat": "Total Capacity GPU Memory",
+          "refId": "GPU Capacity Memory"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "Total GPU Memory Usage",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "$$hashKey": "object:1404",
+          "format": "MiB",
+          "label": "memory",
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "$$hashKey": "object:1405",
+          "format": "short",
+          "label": "",
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "collapsed": false,
+      "datasource": null,
+      "gridPos": {
+        "h": 1,
+        "w": 24,
+        "x": 0,
+        "y": 65
+      },
+      "id": 18,
+      "panels": [],
+      "title": "Average Replica Usage",
+      "type": "row"
+    },
+    {
+      "aliasColors": {
+        "Avg CPU Request": "semi-dark-orange",
+        "Avg CPU Usage": "semi-dark-green",
+        "Total CPU Request": "semi-dark-orange",
+        "Total CPU Usage": "semi-dark-green"
+      },
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": null,
+      "description": "Avg CPU usage across all replicas of the API",
+      "fieldConfig": {
+        "defaults": {
+          "color": {},
+          "custom": {},
+          "thresholds": {
+            "mode": "absolute",
+            "steps": []
+          }
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 66
+      },
+      "hiddenSeries": false,
+      "id": 30,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "7.4.2",
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "exemplar": false,
+          "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})",
+          "format": "time_series",
+          "instant": false,
+          "interval": "",
+          "legendFormat": "Avg CPU Usage",
+          "refId": "CPU Usage"
+        },
+        {
+          "expr": "sum(kube_pod_container_resource_requests_cpu_cores{exported_pod=~\"api-$api_name.+\"})\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})",
+          "hide": false,
+          "interval": "",
+          "legendFormat": "Avg CPU Request",
+          "refId": "CPU Request"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "Avg CPU Usage",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "$$hashKey": "object:1404",
+          "format": "core",
+          "label": "cpu",
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "$$hashKey": "object:1405",
+          "format": "short",
+          "label": "",
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "aliasColors": {
+        "Avg Memory Request": "semi-dark-orange",
+        "Avg Memory Usage": "semi-dark-green",
+        "Total Memory Request": "semi-dark-orange",
+        "Total Memory Usage": "semi-dark-green"
+      },
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": null,
+      "description": "Avg memory usage across all replicas of the API",
+      "fieldConfig": {
+        "defaults": {
+          "color": {},
+          "custom": {},
+          "thresholds": {
+            "mode": "absolute",
+            "steps": []
+          }
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 66
+      },
+      "hiddenSeries": false,
+      "id": 31,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "7.4.2",
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "exemplar": false,
+          "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container!=\"POD\"}[1m]))\n/\nsum(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})",
+          "format": "time_series",
+          "instant": false,
+          "interval": "",
+          "legendFormat": "Avg Memory Usage",
+          "refId": "Memory Usage"
+        },
+        {
+          "expr": "sum(kube_pod_container_resource_requests_memory_bytes{exported_pod=~\"api-$api_name.+\"}) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})",
+          "hide": false,
+          "interval": "",
+          "legendFormat": "Avg Memory Request",
+          "refId": "Memory Request"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "Avg Memory Usage",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "$$hashKey": "object:1404",
+          "format": "MiB",
+          "label": "memory",
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "$$hashKey": "object:1405",
+          "format": "short",
+          "label": "",
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "aliasColors": {
+        "Avg GPU Capacity": "semi-dark-orange",
+        "Avg GPU Usage": "semi-dark-green",
+        "Total GPU Capacity": "semi-dark-orange",
+        "Total GPU Utilization": "semi-dark-green"
+      },
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": null,
+      "description": "Avg GPU core usage across all replicas of the API",
+      "fieldConfig": {
+        "defaults": {
+          "color": {},
+          "custom": {},
+          "thresholds": {
+            "mode": "absolute",
+            "steps": []
+          }
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 0,
+        "y": 74
+      },
+      "hiddenSeries": false,
+      "id": 32,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "7.4.2",
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"})",
+          "hide": false,
+          "interval": "",
+          "legendFormat": "Avg GPU Usage",
+          "refId": "GPU Usage"
+        },
+        {
+          "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"}) by (exported_pod))",
+          "hide": false,
+          "interval": "",
+          "legendFormat": "Avg GPU Capacity",
+          "refId": "GPU Capacity"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "Avg GPU Core Usage",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "$$hashKey": "object:1404",
+          "format": "gpuCore",
+          "label": "gpu",
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "$$hashKey": "object:1405",
+          "format": "short",
+          "label": "",
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "aliasColors": {
+        "Avg Capacity GPU Memory": "semi-dark-orange",
+        "Avg Used GPU Memory": "semi-dark-green",
+        "Total Capacity GPU Memory": "semi-dark-orange",
+        "Total Used GPU Memory": "semi-dark-green"
+      },
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": null,
+      "description": "Avg GPU memory usage across all replicas of the API",
+      "fieldConfig": {
+        "defaults": {
+          "color": {},
+          "custom": {},
+          "thresholds": {
+            "mode": "absolute",
+            "steps": []
+          }
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 12,
+        "y": 74
+      },
+      "hiddenSeries": false,
+      "id": 33,
+      "legend": {
+        "avg": false,
+        "current": false,
+        "max": false,
+        "min": false,
+        "show": true,
+        "total": false,
+        "values": false
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "7.4.2",
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})",
+          "hide": false,
+          "interval": "",
+          "legendFormat": "Avg Used GPU Memory",
+          "refId": "GPU Used Memory"
+        },
+        {
+          "exemplar": false,
+          "expr": "(sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})",
+          "format": "time_series",
+          "instant": false,
+          "interval": "",
+          "legendFormat": "Avg Capacity GPU Memory",
+          "refId": "GPU Capacity Memory"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "Avg GPU Memory Usage",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "$$hashKey": "object:1404",
+          "format": "MiB",
+          "label": "memory",
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "$$hashKey": "object:1405",
+          "format": "short",
+          "label": "",
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": false
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
     }
   ],
   "refresh": "30s",
@@ -1195,5 +2177,5 @@
   "timezone": "",
   "title": "RealtimeAPI",
   "uid": "realtimeapi",
-  "version": 1
+  "version": 4
 }
diff --git a/charts/templates/clusterconfig.yaml b/charts/templates/clusterconfig.yaml
index 960d560a9e..c995388074 100644
--- a/charts/templates/clusterconfig.yaml
+++ b/charts/templates/clusterconfig.yaml
@@ -54,6 +54,7 @@ data:
     image_prometheus_config_reloader: {{ .Values.cortex.image_prometheus_config_reloader }}
     image_prometheus_operator: {{ .Values.cortex.image_prometheus_operator }}
     image_prometheus_statsd_exporter: {{ .Values.cortex.image_prometheus_statsd_exporter }}
+    image_prometheus_kube_state_metrics: {{ .Values.cortex.image_prometheus_kube_state_metrics }}
     image_prometheus_to_cloudwatch: {{ .Values.cortex.image_prometheus_to_cloudwatch }}
 ---
 {{- else if eq .Values.global.provider "gcp" }}
@@ -85,6 +86,7 @@ data:
     image_prometheus_config_reloader: {{ .Values.cortex.image_prometheus_config_reloader }}
     image_prometheus_operator: {{ .Values.cortex.image_prometheus_operator }}
     image_prometheus_statsd_exporter: {{ .Values.cortex.image_prometheus_statsd_exporter }}
+    image_prometheus_kube_state_metrics: {{ .Values.cortex.image_prometheus_kube_state_metrics }}
     image_prometheus_stackdriver_sidecar: {{ .Values.cortex.image_prometheus_stackdriver_sidecar }}
 ---
 {{- end }}
diff --git a/charts/templates/prometheus-kube-state-metrics.yaml b/charts/templates/prometheus-kube-state-metrics.yaml
new file mode 100644
index 0000000000..5879869352
--- /dev/null
+++ b/charts/templates/prometheus-kube-state-metrics.yaml
@@ -0,0 +1,264 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  labels:
+    app.kubernetes.io/name: kube-state-metrics
+  name: kube-state-metrics
+  namespace: {{ .Release.Namespace }}
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  labels:
+    app.kubernetes.io/name: kube-state-metrics
+  name: kube-state-metrics
+rules:
+
+- apiGroups: ["certificates.k8s.io"]
+  resources:
+  - certificatesigningrequests
+  verbs: ["list", "watch"]
+
+- apiGroups: [""]
+  resources:
+  - configmaps
+  verbs: ["list", "watch"]
+
+- apiGroups: ["batch"]
+  resources:
+  - cronjobs
+  verbs: ["list", "watch"]
+
+- apiGroups: ["extensions", "apps"]
+  resources:
+  - daemonsets
+  verbs: ["list", "watch"]
+
+- apiGroups: ["extensions", "apps"]
+  resources:
+  - deployments
+  verbs: ["list", "watch"]
+
+- apiGroups: [""]
+  resources:
+  - endpoints
+  verbs: ["list", "watch"]
+
+- apiGroups: ["autoscaling"]
+  resources:
+  - horizontalpodautoscalers
+  verbs: ["list", "watch"]
+
+- apiGroups: ["extensions", "networking.k8s.io"]
+  resources:
+  - ingresses
+  verbs: ["list", "watch"]
+
+- apiGroups: ["batch"]
+  resources:
+  - jobs
+  verbs: ["list", "watch"]
+
+- apiGroups: [""]
+  resources:
+  - limitranges
+  verbs: ["list", "watch"]
+
+- apiGroups: ["admissionregistration.k8s.io"]
+  resources:
+    - mutatingwebhookconfigurations
+  verbs: ["list", "watch"]
+
+- apiGroups: [""]
+  resources:
+  - namespaces
+  verbs: ["list", "watch"]
+
+- apiGroups: ["networking.k8s.io"]
+  resources:
+  - networkpolicies
+  verbs: ["list", "watch"]
+
+- apiGroups: [""]
+  resources:
+  - nodes
+  verbs: ["list", "watch"]
+
+- apiGroups: [""]
+  resources:
+  - persistentvolumeclaims
+  verbs: ["list", "watch"]
+
+- apiGroups: [""]
+  resources:
+  - persistentvolumes
+  verbs: ["list", "watch"]
+
+- apiGroups: ["policy"]
+  resources:
+    - poddisruptionbudgets
+  verbs: ["list", "watch"]
+
+- apiGroups: [""]
+  resources:
+  - pods
+  verbs: ["list", "watch"]
+
+- apiGroups: ["extensions", "apps"]
+  resources:
+  - replicasets
+  verbs: ["list", "watch"]
+
+- apiGroups: [""]
+  resources:
+  - replicationcontrollers
+  verbs: ["list", "watch"]
+
+- apiGroups: [""]
+  resources:
+  - resourcequotas
+  verbs: ["list", "watch"]
+
+- apiGroups: [""]
+  resources:
+  - secrets
+  verbs: ["list", "watch"]
+
+- apiGroups: [""]
+  resources:
+  - services
+  verbs: ["list", "watch"]
+
+- apiGroups: ["apps"]
+  resources:
+  - statefulsets
+  verbs: ["list", "watch"]
+
+- apiGroups: ["storage.k8s.io"]
+  resources:
+    - storageclasses
+  verbs: ["list", "watch"]
+
+- apiGroups: ["admissionregistration.k8s.io"]
+  resources:
+    - validatingwebhookconfigurations
+  verbs: ["list", "watch"]
+
+- apiGroups: ["storage.k8s.io"]
+  resources:
+    - volumeattachments
+  verbs: ["list", "watch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  labels:
+    app.kubernetes.io/name: kube-state-metrics
+  name: kube-state-metrics
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: kube-state-metrics
+subjects:
+- kind: ServiceAccount
+  name: kube-state-metrics
+  namespace: {{ .Release.Namespace }}
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: kube-state-metrics
+  namespace: {{ .Release.Namespace }}
+  labels:
+    app.kubernetes.io/name: kube-state-metrics
+    app.kubernetes.io/version: "1.9.8"
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: kube-state-metrics
+  replicas: 1
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: kube-state-metrics
+    spec:
+      hostNetwork: false
+      serviceAccountName: kube-state-metrics
+      securityContext:
+        fsGroup: 65534
+        runAsGroup: 65534
+        runAsUser: 65534
+      containers:
+      - name: kube-state-metrics
+        resources:
+          requests:
+            cpu: 300m
+            memory: 400Mi
+        args:
+        - --collectors=certificatesigningrequests
+        - --collectors=configmaps
+        - --collectors=cronjobs
+        - --collectors=daemonsets
+        - --collectors=deployments
+        - --collectors=endpoints
+        - --collectors=horizontalpodautoscalers
+        - --collectors=ingresses
+        - --collectors=jobs
+        - --collectors=limitranges
+        - --collectors=mutatingwebhookconfigurations
+        - --collectors=namespaces
+        - --collectors=networkpolicies
+        - --collectors=nodes
+        - --collectors=persistentvolumeclaims
+        - --collectors=persistentvolumes
+        - --collectors=poddisruptionbudgets
+        - --collectors=pods
+        - --collectors=replicasets
+        - --collectors=replicationcontrollers
+        - --collectors=resourcequotas
+        - --collectors=secrets
+        - --collectors=services
+        - --collectors=statefulsets
+        - --collectors=storageclasses
+        - --collectors=validatingwebhookconfigurations
+        - --collectors=volumeattachments
+        - --telemetry-port=8081
+        imagePullPolicy: Always
+        image: {{ .Values.cortex.image_prometheus_kube_state_metrics }}
+        ports:
+        - containerPort: 8080
+          name: metrics
+          protocol: TCP
+        livenessProbe:
+          httpGet:
+            path: /healthz
+            port: 8080
+          initialDelaySeconds: 5
+          timeoutSeconds: 5
+        readinessProbe:
+          httpGet:
+            path: /
+            port: 8080
+          initialDelaySeconds: 5
+          timeoutSeconds: 5
+---
+apiVersion: monitoring.coreos.com/v1
+kind: PodMonitor
+metadata:
+  name: kube-state-metrics
+  namespace: {{ .Release.Namespace }}
+  labels:
+    name: kube-state-metrics
+    monitoring.cortex.dev: kube-state-metrics
+spec:
+  jobLabel: "kube-state-metrics"
+  podMetricsEndpoints:
+    - port: metrics
+      scheme: http
+      path: /metrics
+      interval: 30s
+  namespaceSelector:
+    any: true
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: kube-state-metrics
diff --git a/charts/templates/prometheus-kubelet-exporter.yaml b/charts/templates/prometheus-kubelet-exporter.yaml
new file mode 100644
index 0000000000..8784c1e949
--- /dev/null
+++ b/charts/templates/prometheus-kubelet-exporter.yaml
@@ -0,0 +1,91 @@
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  labels:
+    k8s-app: kubelet
+    monitoring.cortex.dev: kubelet-exporter
+  name: kubelet
+  namespace: {{ .Release.Namespace }}
+spec:
+  endpoints:
+  - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+    honorLabels: true
+    interval: 30s
+    metricRelabelings:
+    - action: drop
+      regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs)
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex: transformation_(transformation_latencies_microseconds|failures_total)
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex: (admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_condition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retr
ies|AvailableConditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retries|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rules_latency_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries)
+      sourceLabels:
+      - __name__
+    port: https-metrics
+    relabelings:
+    - sourceLabels:
+      - __metrics_path__
+      targetLabel: metrics_path
+    scheme: https
+    tlsConfig:
+      insecureSkipVerify: true
+  - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+    honorLabels: true
+    honorTimestamps: false
+    interval: 30s
+    metricRelabelings:
+    - action: drop
+      regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)
+      sourceLabels:
+      - __name__
+    path: /metrics/cadvisor
+    port: https-metrics
+    relabelings:
+    - sourceLabels:
+      - __metrics_path__
+      targetLabel: metrics_path
+    scheme: https
+    tlsConfig:
+      insecureSkipVerify: true
+  - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+    honorLabels: true
+    interval: 30s
+    path: /metrics/probes
+    port: https-metrics
+    relabelings:
+    - sourceLabels:
+      - __metrics_path__
+      targetLabel: metrics_path
+    scheme: https
+    tlsConfig:
+      insecureSkipVerify: true
+  jobLabel: k8s-app
+  namespaceSelector:
+    matchNames:
+    - kube-system
+  selector:
+    matchLabels:
+      k8s-app: kubelet
diff --git a/charts/templates/prometheus-monitoring.yaml b/charts/templates/prometheus-monitoring.yaml
index 18a0a9e11b..1b211cb8bb 100644
--- a/charts/templates/prometheus-monitoring.yaml
+++ b/charts/templates/prometheus-monitoring.yaml
@@ -27,12 +27,12 @@ spec:
     matchExpressions:
       - key: "monitoring.cortex.dev"
         operator: "In"
-        values: [ "istio", "request-monitor", "statsd-exporter" ]
+        values: [ "istio", "request-monitor", "statsd-exporter", "dcgm-exporter", "kube-state-metrics" ]
   serviceMonitorSelector:
     matchExpressions:
       - key: "monitoring.cortex.dev"
         operator: "In"
-        values: [ "node-exporter" ]
+        values: [ "kubelet-exporter", "node-exporter" ]
   ruleSelector:
     matchLabels:
       prometheus: k8s
diff --git a/charts/values.yaml b/charts/values.yaml
index 2548d81d03..72492e2bef 100644
--- a/charts/values.yaml
+++ b/charts/values.yaml
@@ -28,6 +28,7 @@ cortex:
   image_prometheus_config_reloader: quay.io/cortexlabs/prometheus-config-reloader:master
   image_prometheus_operator: quay.io/cortexlabs/prometheus-operator:master
   image_prometheus_statsd_exporter: quay.io/cortexlabs/prometheus-statsd-exporter:master
+  image_prometheus_kube_state_metrics: quay.io/cortexlabs/prometheus-kube-state-metrics:master
   image_prometheus_node_exporter: quay.io/cortexlabs/prometheus-node-exporter:master
   image_kube_rbac_proxy: quay.io/cortexlabs/kube-rbac-proxy:master
   image_grafana: quay.io/cortexlabs/grafana:master
diff --git a/cli/cmd/lib_cluster_config_aws.go b/cli/cmd/lib_cluster_config_aws.go
index 7c9e7a79c4..d30f59f3d9 100644
--- a/cli/cmd/lib_cluster_config_aws.go
+++ b/cli/cmd/lib_cluster_config_aws.go
@@ -424,6 +424,14 @@ func setConfigFieldsFromCached(userClusterConfig *clusterconfig.Config, cachedCl
 		return clusterconfig.ErrorConfigCannotBeChangedOnUpdate(clusterconfig.ImagePrometheusStatsDExporterKey, cachedClusterConfig.ImagePrometheusStatsDExporter)
 	}
 
+	if s.Obj(cachedClusterConfig.ImagePrometheusDCGMExporter) != s.Obj(userClusterConfig.ImagePrometheusDCGMExporter) {
+		return clusterconfig.ErrorConfigCannotBeChangedOnUpdate(clusterconfig.ImagePrometheusDCGMExporterKey, cachedClusterConfig.ImagePrometheusDCGMExporter)
+	}
+
+	if s.Obj(cachedClusterConfig.ImagePrometheusKubeStateMetrics) != s.Obj(userClusterConfig.ImagePrometheusKubeStateMetrics) {
+		return clusterconfig.ErrorConfigCannotBeChangedOnUpdate(clusterconfig.ImagePrometheusKubeStateMetricsKey, cachedClusterConfig.ImagePrometheusKubeStateMetrics)
+	}
+
 	if s.Obj(cachedClusterConfig.ImagePrometheusNodeExporter) != s.Obj(userClusterConfig.ImagePrometheusNodeExporter) {
 		return clusterconfig.ErrorConfigCannotBeChangedOnUpdate(clusterconfig.ImagePrometheusNodeExporterKey, cachedClusterConfig.ImagePrometheusNodeExporter)
 	}
@@ -748,6 +756,12 @@ func clusterConfigConfirmationStr(clusterConfig clusterconfig.Config) string {
 	if clusterConfig.ImagePrometheusStatsDExporter != defaultConfig.ImagePrometheusStatsDExporter {
 		items.Add(clusterconfig.ImagePrometheusStatsDExporterUserKey, clusterConfig.ImagePrometheusStatsDExporter)
 	}
+	if clusterConfig.ImagePrometheusDCGMExporter != defaultConfig.ImagePrometheusDCGMExporter {
+		items.Add(clusterconfig.ImagePrometheusDCGMExporterUserKey, clusterConfig.ImagePrometheusDCGMExporter)
+	}
+	if clusterConfig.ImagePrometheusKubeStateMetrics != defaultConfig.ImagePrometheusKubeStateMetrics {
+		items.Add(clusterconfig.ImagePrometheusKubeStateMetricsUserKey, clusterConfig.ImagePrometheusKubeStateMetrics)
+	}
 	if clusterConfig.ImageGrafana != defaultConfig.ImageGrafana {
 		items.Add(clusterconfig.ImageGrafanaUserKey, clusterConfig.ImageGrafana)
 	}
diff --git a/dev/versions.md b/dev/versions.md
index 902af66338..611f76033a 100644
--- a/dev/versions.md
+++ b/dev/versions.md
@@ -323,6 +323,22 @@ supported (<https://github.com/awslabs/amazon-eks-ami/issues/176>)
 1. Update the base image version in `images/prometheus-statsd-exporter/Dockerfile`.
 1. Update `prometheus-statsd-exporter.yaml` as necessary, if that's the case.
 
+## Prometheus DCGM Exporter
+
+1. Run `helm template` on the DCGM charts https://github.com/NVIDIA/gpu-monitoring-tools/tree/master/deployment/dcgm-exporter and save the output somewhere temporarily.
+1. Update the base image version in `images/prometheus-dcgm-exporter/Dockerfile`.
+1. Update `prometheus-dcgm-exporter.yaml` as necessary, if that's the case. Keep in mind that in our k8s template, the `ServiceMonitor` was changed to a `PodMonitor`. Remove any unnecessary labels.
+
+## Prometheus kube-state-metrics Exporter
+
+1. Run `helm template` on the kube-state-metrics charts https://github.com/kubernetes/kube-state-metrics/tree/master/charts/kube-state-metrics and save the output somewhere temporarily.
+1. Update the base image version in `images/prometheus-kube-state-metrics/Dockerfile`.
+1. Update `prometheus-kube-state-metrics.yaml` as necessary. Keep in mind that in our k8s template, the `ServiceMonitor` was changed to a `PodMonitor`. Remove any unnecessary labels. The update can also include adjusting the resource requests.
+
+## Prometheus Kubelet Exporter
+
+1. Check if https://github.com/prometheus-operator/kube-prometheus/blob/main/manifests/kubernetes-serviceMonitorKubelet.yaml has changed when compared to `manager/manifests/prometheus-kubelet-exporter.yaml`.
+
 ## Prometheus Node Exporter
 
 1. Find the latest release in the Kube Prometheus [GitHub Repo](https://github.com/prometheus-operator/kube-prometheus/blob/main/manifests/).
diff --git a/docs/clusters/aws/install.md b/docs/clusters/aws/install.md
index 1532f598aa..34df6935f7 100644
--- a/docs/clusters/aws/install.md
+++ b/docs/clusters/aws/install.md
@@ -108,6 +108,8 @@ image_prometheus: quay.io/cortexlabs/prometheus:master
 image_prometheus_config_reloader: quay.io/cortexlabs/prometheus-config-reloader:master
 image_prometheus_operator: quay.io/cortexlabs/prometheus-operator:master
 image_prometheus_statsd_exporter: quay.io/cortexlabs/prometheus-statsd-exporter:master
+image_prometheus_dcgm_exporter: quay.io/cortexlabs/prometheus-dcgm-exporter:master
+image_prometheus_kube_state_metrics: quay.io/cortexlabs/prometheus-kube-state-metrics:master
 image_prometheus_node_exporter: quay.io/cortexlabs/prometheus-node-exporter:master
 image_kube_rbac_proxy: quay.io/cortexlabs/kube-rbac-proxy:master
 image_grafana: quay.io/cortexlabs/grafana:master
diff --git a/docs/clusters/gcp/install.md b/docs/clusters/gcp/install.md
index be3e043b22..5c6294a701 100644
--- a/docs/clusters/gcp/install.md
+++ b/docs/clusters/gcp/install.md
@@ -82,6 +82,8 @@ image_prometheus: quay.io/cortexlabs/prometheus:master
 image_prometheus_config_reloader: quay.io/cortexlabs/prometheus-config-reloader:master
 image_prometheus_operator: quay.io/cortexlabs/prometheus-operator:master
 image_prometheus_statsd_exporter: quay.io/cortexlabs/prometheus-statsd-exporter:master
+image_prometheus_dcgm_exporter: quay.io/cortexlabs/prometheus-dcgm-exporter:master
+image_prometheus_kube_state_metrics: quay.io/cortexlabs/prometheus-kube-state-metrics:master
 image_prometheus_node_exporter: quay.io/cortexlabs/prometheus-node-exporter:master
 image_kube_rbac_proxy: quay.io/cortexlabs/kube-rbac-proxy:master
 image_grafana: quay.io/cortexlabs/grafana:master
diff --git a/images/prometheus-dcgm-exporter/Dockerfile b/images/prometheus-dcgm-exporter/Dockerfile
new file mode 100644
index 0000000000..175a1a35d6
--- /dev/null
+++ b/images/prometheus-dcgm-exporter/Dockerfile
@@ -0,0 +1 @@
+FROM nvidia/dcgm-exporter:2.0.13-2.1.1-ubuntu18.04
diff --git a/images/prometheus-kube-state-metrics/Dockerfile b/images/prometheus-kube-state-metrics/Dockerfile
new file mode 100644
index 0000000000..f0f37fb6f8
--- /dev/null
+++ b/images/prometheus-kube-state-metrics/Dockerfile
@@ -0,0 +1 @@
+FROM k8s.gcr.io/kube-state-metrics/kube-state-metrics:v1.9.8
diff --git a/manager/install.sh b/manager/install.sh
index 5a9917f16c..087d09026b 100755
--- a/manager/install.sh
+++ b/manager/install.sh
@@ -74,6 +74,7 @@ function cluster_up_aws() {
   if [[ "$CORTEX_INSTANCE_TYPE" == p* ]] || [[ "$CORTEX_INSTANCE_TYPE" == g* ]]; then
     echo -n "￮ configuring gpu support "
     envsubst < manifests/nvidia_aws.yaml | kubectl apply -f - >/dev/null
+    envsubst < manifests/prometheus-dcgm-exporter.yaml | kubectl apply -f - >/dev/null
     echo "✓"
   fi
 
@@ -132,6 +133,7 @@ function cluster_up_gcp() {
   if [ -n "$CORTEX_ACCELERATOR_TYPE" ]; then
     echo -n "￮ configuring gpu support "
     envsubst < manifests/nvidia_gcp.yaml | kubectl apply -f - >/dev/null
+    envsubst < manifests/prometheus-dcgm-exporter.yaml | kubectl apply -f - >/dev/null
     echo "✓"
   fi
 
@@ -303,6 +305,8 @@ function setup_secrets() {
 function setup_prometheus() {
   envsubst < manifests/prometheus-operator.yaml | kubectl apply -f - >/dev/null
   envsubst < manifests/prometheus-statsd-exporter.yaml | kubectl apply -f - >/dev/null
+  envsubst < manifests/prometheus-kubelet-exporter.yaml | kubectl apply -f - >/dev/null
+  envsubst < manifests/prometheus-kube-state-metrics.yaml | kubectl apply -f - >/dev/null
   envsubst < manifests/prometheus-node-exporter.yaml | kubectl apply -f - >/dev/null
   python render_template.py $CORTEX_CLUSTER_CONFIG_FILE manifests/prometheus-monitoring.yaml.j2 | kubectl apply -f - >/dev/null
 }
diff --git a/manager/manifests/grafana/grafana-dashboard-batch.yaml b/manager/manifests/grafana/grafana-dashboard-batch.yaml
index 2624506a9e..f82fff3f21 100644
--- a/manager/manifests/grafana/grafana-dashboard-batch.yaml
+++ b/manager/manifests/grafana/grafana-dashboard-batch.yaml
@@ -36,6 +36,7 @@ data:
       "editable": true,
       "gnetId": null,
       "graphTooltip": 0,
+      "iteration": 1614622843373,
       "links": [],
       "panels": [
         {
@@ -57,13 +58,25 @@ data:
             "content": "<h1 style=\"text-align: center\">BatchAPI</h1>\n",
             "mode": "markdown"
           },
-          "pluginVersion": "7.4.0",
+          "pluginVersion": "7.4.2",
           "timeFrom": null,
           "timeShift": null,
           "title": "",
           "transparent": true,
           "type": "text"
         },
+        {
+          "datasource": null,
+          "gridPos": {
+            "h": 1,
+            "w": 24,
+            "x": 0,
+            "y": 2
+          },
+          "id": 22,
+          "title": "API Stats",
+          "type": "row"
+        },
         {
           "aliasColors": {},
           "bars": false,
@@ -83,7 +96,7 @@ data:
             "h": 9,
             "w": 12,
             "x": 0,
-            "y": 2
+            "y": 3
           },
           "hiddenSeries": false,
           "id": 2,
@@ -104,7 +117,7 @@ data:
             "alertThreshold": true
           },
           "percentage": false,
-          "pluginVersion": "7.4.0",
+          "pluginVersion": "7.4.2",
           "pointradius": 2,
           "points": false,
           "renderer": "flot",
@@ -183,7 +196,7 @@ data:
             "h": 9,
             "w": 12,
             "x": 12,
-            "y": 2
+            "y": 3
           },
           "hiddenSeries": false,
           "id": 3,
@@ -204,7 +217,7 @@ data:
             "alertThreshold": true
           },
           "percentage": false,
-          "pluginVersion": "7.4.0",
+          "pluginVersion": "7.4.2",
           "pointradius": 2,
           "points": false,
           "renderer": "flot",
@@ -240,6 +253,7 @@ data:
           },
           "yaxes": [
             {
+              "$$hashKey": "object:262",
               "decimals": 0,
               "format": "short",
               "label": null,
@@ -249,6 +263,7 @@ data:
               "show": true
             },
             {
+              "$$hashKey": "object:263",
               "format": "short",
               "label": null,
               "logBase": 1,
@@ -280,8 +295,8 @@ data:
           "gridPos": {
             "h": 8,
             "w": 12,
-            "x": 6,
-            "y": 11
+            "x": 0,
+            "y": 12
           },
           "hiddenSeries": false,
           "id": 5,
@@ -302,7 +317,7 @@ data:
             "alertThreshold": true
           },
           "percentage": false,
-          "pluginVersion": "7.4.0",
+          "pluginVersion": "7.4.2",
           "pointradius": 2,
           "points": false,
           "renderer": "flot",
@@ -358,6 +373,1078 @@ data:
             "align": false,
             "alignLevel": null
           }
+        },
+        {
+          "aliasColors": {
+            "Active Jobs": "semi-dark-green",
+            "Active Workers": "semi-dark-orange"
+          },
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": null,
+          "description": "Active jobs/workers",
+          "fieldConfig": {
+            "defaults": {
+              "custom": {}
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 12
+          },
+          "hiddenSeries": false,
+          "id": 20,
+          "legend": {
+            "alignAsTable": true,
+            "avg": true,
+            "current": true,
+            "max": true,
+            "min": true,
+            "show": true,
+            "total": false,
+            "values": true
+          },
+          "lines": true,
+          "linewidth": 1,
+          "nullPointMode": "null",
+          "options": {
+            "alertThreshold": true
+          },
+          "percentage": false,
+          "pluginVersion": "7.4.2",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "count(kube_job_status_active{job_name=~\"$api_name.+\"})",
+              "interval": "",
+              "legendFormat": "Active Jobs",
+              "refId": "Active Batches"
+            },
+            {
+              "expr": "sum(kube_pod_info{exported_pod=~\"$api_name.+\"})",
+              "hide": false,
+              "interval": "",
+              "legendFormat": "Active Workers",
+              "refId": "Active Workers"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "# Active Jobs/Workers",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "$$hashKey": "object:128",
+              "decimals": 0,
+              "format": "count",
+              "label": "",
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "$$hashKey": "object:129",
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        },
+        {
+          "datasource": null,
+          "gridPos": {
+            "h": 1,
+            "w": 24,
+            "x": 0,
+            "y": 20
+          },
+          "id": 11,
+          "title": "Aggregate Worker Usage",
+          "type": "row"
+        },
+        {
+          "aliasColors": {
+            "Total CPU Request": "semi-dark-orange",
+            "Total CPU Usage": "semi-dark-green"
+          },
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": null,
+          "description": "Total CPU usage across all workers of the API",
+          "fieldConfig": {
+            "defaults": {
+              "color": {},
+              "custom": {},
+              "thresholds": {
+                "mode": "absolute",
+                "steps": []
+              }
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 0,
+            "y": 21
+          },
+          "hiddenSeries": false,
+          "id": 13,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "nullPointMode": "null",
+          "options": {
+            "alertThreshold": true
+          },
+          "percentage": false,
+          "pluginVersion": "7.4.2",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "exemplar": false,
+              "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))",
+              "format": "time_series",
+              "instant": false,
+              "interval": "",
+              "legendFormat": "Total CPU Usage",
+              "refId": "CPU Usage"
+            },
+            {
+              "expr": "sum(kube_pod_container_resource_requests_cpu_cores{exported_pod=~\"$api_name.+\"})",
+              "hide": false,
+              "interval": "",
+              "legendFormat": "Total CPU Request",
+              "refId": "CPU Request"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "Total CPU Usage",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "$$hashKey": "object:1404",
+              "format": "core",
+              "label": "cpu",
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "$$hashKey": "object:1405",
+              "format": "short",
+              "label": "",
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": false
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        },
+        {
+          "aliasColors": {
+            "Total Memory Request": "semi-dark-orange",
+            "Total Memory Usage": "semi-dark-green"
+          },
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": null,
+          "description": "Total memory usage across all workers of the API",
+          "fieldConfig": {
+            "defaults": {
+              "color": {},
+              "custom": {},
+              "thresholds": {
+                "mode": "absolute",
+                "steps": []
+              }
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 21
+          },
+          "hiddenSeries": false,
+          "id": 15,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "nullPointMode": "null",
+          "options": {
+            "alertThreshold": true
+          },
+          "percentage": false,
+          "pluginVersion": "7.4.2",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "exemplar": false,
+              "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container!=\"POD\"}[1m]))\n/\nsum(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2",
+              "format": "time_series",
+              "instant": false,
+              "interval": "",
+              "legendFormat": "Total Memory Usage",
+              "refId": "Memory Usage"
+            },
+            {
+              "expr": "sum(kube_pod_container_resource_requests_memory_bytes{exported_pod=~\"$api_name.+\"}) / 1024^2",
+              "hide": false,
+              "interval": "",
+              "legendFormat": "Total Memory Request",
+              "refId": "Memory Request"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "Total Memory Usage",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "$$hashKey": "object:1404",
+              "format": "MiB",
+              "label": "memory",
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "$$hashKey": "object:1405",
+              "format": "short",
+              "label": "",
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": false
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        },
+        {
+          "aliasColors": {
+            "Total GPU Capacity": "semi-dark-orange",
+            "Total GPU Usage": "semi-dark-green",
+            "Total GPU Utilization": "light-green"
+          },
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": null,
+          "description": "Total GPU core usage across all workers of the API",
+          "fieldConfig": {
+            "defaults": {
+              "color": {},
+              "custom": {},
+              "thresholds": {
+                "mode": "absolute",
+                "steps": []
+              }
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 0,
+            "y": 29
+          },
+          "hiddenSeries": false,
+          "id": 17,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "nullPointMode": "null",
+          "options": {
+            "alertThreshold": true
+          },
+          "percentage": false,
+          "pluginVersion": "7.4.2",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) / 100",
+              "hide": false,
+              "interval": "",
+              "legendFormat": "Total GPU Usage",
+              "refId": "GPU Usage"
+            },
+            {
+              "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})",
+              "hide": false,
+              "interval": "",
+              "legendFormat": "Total GPU Capacity",
+              "refId": "GPU Capacity"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "Total GPU Core Usage",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "$$hashKey": "object:1404",
+              "format": "gpuCore",
+              "label": "gpu",
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "$$hashKey": "object:1405",
+              "format": "short",
+              "label": "",
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": false
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        },
+        {
+          "aliasColors": {
+            "Total Capacity GPU Memory": "semi-dark-orange",
+            "Total Used GPU Memory": "semi-dark-green"
+          },
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": null,
+          "description": "Total GPU memory usage across all workers of the API",
+          "fieldConfig": {
+            "defaults": {
+              "color": {},
+              "custom": {},
+              "thresholds": {
+                "mode": "absolute",
+                "steps": []
+              }
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 29
+          },
+          "hiddenSeries": false,
+          "id": 19,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "nullPointMode": "null",
+          "options": {
+            "alertThreshold": true
+          },
+          "percentage": false,
+          "pluginVersion": "7.4.2",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})",
+              "hide": false,
+              "interval": "",
+              "legendFormat": "Total Used GPU Memory",
+              "refId": "GPU Used Memory"
+            },
+            {
+              "exemplar": false,
+              "expr": "sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})",
+              "format": "time_series",
+              "instant": false,
+              "interval": "",
+              "legendFormat": "Total Capacity GPU Memory",
+              "refId": "GPU Capacity Memory"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "Total GPU Memory Usage",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "$$hashKey": "object:1404",
+              "format": "MiB",
+              "label": "memory",
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "$$hashKey": "object:1405",
+              "format": "short",
+              "label": "",
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": false
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        },
+        {
+          "collapsed": false,
+          "datasource": null,
+          "gridPos": {
+            "h": 1,
+            "w": 24,
+            "x": 0,
+            "y": 37
+          },
+          "id": 9,
+          "panels": [],
+          "title": "Avg Worker Usage",
+          "type": "row"
+        },
+        {
+          "aliasColors": {
+            "Avg CPU Request": "semi-dark-orange",
+            "Avg CPU Usage": "semi-dark-green",
+            "Total CPU Request": "semi-dark-orange",
+            "Total CPU Usage": "semi-dark-green"
+          },
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": null,
+          "description": "Avg CPU usage across all workers of the API",
+          "fieldConfig": {
+            "defaults": {
+              "color": {},
+              "custom": {},
+              "thresholds": {
+                "mode": "absolute",
+                "steps": []
+              }
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 0,
+            "y": 38
+          },
+          "hiddenSeries": false,
+          "id": 23,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "nullPointMode": "null",
+          "options": {
+            "alertThreshold": true
+          },
+          "percentage": false,
+          "pluginVersion": "7.4.2",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "exemplar": false,
+              "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})",
+              "format": "time_series",
+              "instant": false,
+              "interval": "",
+              "legendFormat": "Avg CPU Usage",
+              "refId": "CPU Usage"
+            },
+            {
+              "expr": "sum(kube_pod_container_resource_requests_cpu_cores{exported_pod=~\"$api_name.+\"})\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})",
+              "hide": false,
+              "interval": "",
+              "legendFormat": "Avg CPU Request",
+              "refId": "CPU Request"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "Avg CPU Usage",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "$$hashKey": "object:1404",
+              "format": "core",
+              "label": "cpu",
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "$$hashKey": "object:1405",
+              "format": "short",
+              "label": "",
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": false
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        },
+        {
+          "aliasColors": {
+            "Avg Memory Request": "semi-dark-orange",
+            "Avg Memory Usage": "semi-dark-green",
+            "Total Memory Request": "semi-dark-orange",
+            "Total Memory Usage": "semi-dark-green"
+          },
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": null,
+          "description": "Avg memory usage across all workers of the API",
+          "fieldConfig": {
+            "defaults": {
+              "color": {},
+              "custom": {},
+              "thresholds": {
+                "mode": "absolute",
+                "steps": []
+              }
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 38
+          },
+          "hiddenSeries": false,
+          "id": 24,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "nullPointMode": "null",
+          "options": {
+            "alertThreshold": true
+          },
+          "percentage": false,
+          "pluginVersion": "7.4.2",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "exemplar": false,
+              "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container!=\"POD\"}[1m]))\n/\nsum(count_over_time(container_memory_working_set_bytes{pod=~\"$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})",
+              "format": "time_series",
+              "instant": false,
+              "interval": "",
+              "legendFormat": "Avg Memory Usage",
+              "refId": "Memory Usage"
+            },
+            {
+              "expr": "sum(kube_pod_container_resource_requests_memory_bytes{exported_pod=~\"$api_name.+\"}) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"$api_name.+\"})",
+              "hide": false,
+              "interval": "",
+              "legendFormat": "Avg Memory Request",
+              "refId": "Memory Request"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "Avg Memory Usage",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "$$hashKey": "object:1404",
+              "format": "MiB",
+              "label": "memory",
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "$$hashKey": "object:1405",
+              "format": "short",
+              "label": "",
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": false
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        },
+        {
+          "aliasColors": {
+            "Avg GPU Capacity": "semi-dark-orange",
+            "Avg GPU Usage": "semi-dark-green",
+            "Total GPU Capacity": "semi-dark-orange",
+            "Total GPU Usage": "semi-dark-green",
+            "Total GPU Utilization": "light-green"
+          },
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": null,
+          "description": "Avg GPU core usage across all workers of the API",
+          "fieldConfig": {
+            "defaults": {
+              "color": {},
+              "custom": {},
+              "thresholds": {
+                "mode": "absolute",
+                "steps": []
+              }
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 0,
+            "y": 46
+          },
+          "hiddenSeries": false,
+          "id": 25,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "nullPointMode": "null",
+          "options": {
+            "alertThreshold": true
+          },
+          "percentage": false,
+          "pluginVersion": "7.4.2",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})",
+              "hide": false,
+              "instant": false,
+              "interval": "",
+              "legendFormat": "Avg GPU Usage",
+              "refId": "GPU Usage"
+            },
+            {
+              "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"$api_name.+\"}) by (exported_pod))",
+              "hide": false,
+              "instant": false,
+              "interval": "",
+              "legendFormat": "Avg GPU Capacity",
+              "refId": "GPU Capacity"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "Avg GPU Core Usage",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "$$hashKey": "object:1404",
+              "format": "gpuCore",
+              "label": "gpu",
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "$$hashKey": "object:1405",
+              "format": "short",
+              "label": "",
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": false
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        },
+        {
+          "aliasColors": {
+            "Avg Capacity GPU Memory": "semi-dark-orange",
+            "Avg Used GPU Memory": "semi-dark-green",
+            "Total Capacity GPU Memory": "semi-dark-orange",
+            "Total Used GPU Memory": "semi-dark-green"
+          },
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": null,
+          "description": "Avg GPU memory usage across all workers of the API",
+          "fieldConfig": {
+            "defaults": {
+              "color": {},
+              "custom": {},
+              "thresholds": {
+                "mode": "absolute",
+                "steps": []
+              }
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 46
+          },
+          "hiddenSeries": false,
+          "id": 26,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "nullPointMode": "null",
+          "options": {
+            "alertThreshold": true
+          },
+          "percentage": false,
+          "pluginVersion": "7.4.2",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})",
+              "hide": false,
+              "interval": "",
+              "legendFormat": "Avg Used GPU Memory",
+              "refId": "GPU Used Memory"
+            },
+            {
+              "exemplar": false,
+              "expr": "(sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"$api_name.+\"})",
+              "format": "time_series",
+              "instant": false,
+              "interval": "",
+              "legendFormat": "Avg Capacity GPU Memory",
+              "refId": "GPU Capacity Memory"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "Avg GPU Memory Usage",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "$$hashKey": "object:1404",
+              "format": "MiB",
+              "label": "memory",
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "$$hashKey": "object:1405",
+              "format": "short",
+              "label": "",
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": false
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
         }
       ],
       "refresh": "30s",
@@ -383,7 +1470,7 @@ data:
               "query": "label_values({__name__=~\"cortex_batch_.+\"}, api_name)",
               "refId": "StandardVariableQuery"
             },
-            "refresh": 1,
+            "refresh": 2,
             "regex": "",
             "skipUrlSync": false,
             "sort": 0,
diff --git a/manager/manifests/grafana/grafana-dashboard-realtime.yaml b/manager/manifests/grafana/grafana-dashboard-realtime.yaml
index 41984d4120..1dbdfaca83 100644
--- a/manager/manifests/grafana/grafana-dashboard-realtime.yaml
+++ b/manager/manifests/grafana/grafana-dashboard-realtime.yaml
@@ -36,8 +36,7 @@ data:
       "editable": true,
       "gnetId": null,
       "graphTooltip": 0,
-      "id": 10,
-      "iteration": 1612793050833,
+      "iteration": 1614624509947,
       "links": [],
       "panels": [
         {
@@ -59,13 +58,27 @@ data:
             "content": "<h1 style=\"text-align: center\">RealtimeAPI</h1>",
             "mode": "markdown"
           },
-          "pluginVersion": "7.4.0",
+          "pluginVersion": "7.4.2",
           "timeFrom": null,
           "timeShift": null,
           "title": "",
           "transparent": true,
           "type": "text"
         },
+        {
+          "collapsed": false,
+          "datasource": null,
+          "gridPos": {
+            "h": 1,
+            "w": 24,
+            "x": 0,
+            "y": 2
+          },
+          "id": 22,
+          "panels": [],
+          "title": "API Stats",
+          "type": "row"
+        },
         {
           "aliasColors": {},
           "bars": false,
@@ -85,7 +98,7 @@ data:
             "h": 9,
             "w": 12,
             "x": 0,
-            "y": 2
+            "y": 3
           },
           "hiddenSeries": false,
           "id": 2,
@@ -107,7 +120,7 @@ data:
             "alertThreshold": true
           },
           "percentage": false,
-          "pluginVersion": "7.4.0",
+          "pluginVersion": "7.4.2",
           "pointradius": 2,
           "points": false,
           "renderer": "flot",
@@ -199,7 +212,7 @@ data:
             "h": 9,
             "w": 12,
             "x": 12,
-            "y": 2
+            "y": 3
           },
           "hiddenSeries": false,
           "id": 4,
@@ -220,7 +233,7 @@ data:
             "alertThreshold": true
           },
           "percentage": false,
-          "pluginVersion": "7.4.0",
+          "pluginVersion": "7.4.2",
           "pointradius": 2,
           "points": false,
           "renderer": "flot",
@@ -297,7 +310,7 @@ data:
             "h": 9,
             "w": 12,
             "x": 0,
-            "y": 11
+            "y": 12
           },
           "hiddenSeries": false,
           "id": 8,
@@ -319,7 +332,7 @@ data:
             "alertThreshold": true
           },
           "percentage": false,
-          "pluginVersion": "7.4.0",
+          "pluginVersion": "7.4.2",
           "pointradius": 2,
           "points": false,
           "renderer": "flot",
@@ -371,6 +384,7 @@ data:
           },
           "yaxes": [
             {
+              "$$hashKey": "object:1217",
               "format": "reqps",
               "label": null,
               "logBase": 1,
@@ -379,6 +393,7 @@ data:
               "show": true
             },
             {
+              "$$hashKey": "object:1218",
               "format": "short",
               "label": null,
               "logBase": 1,
@@ -410,7 +425,7 @@ data:
             "h": 9,
             "w": 12,
             "x": 12,
-            "y": 11
+            "y": 12
           },
           "hiddenSeries": false,
           "id": 7,
@@ -431,7 +446,7 @@ data:
             "alertThreshold": true
           },
           "percentage": false,
-          "pluginVersion": "7.4.0",
+          "pluginVersion": "7.4.2",
           "pointradius": 2,
           "points": false,
           "renderer": "flot",
@@ -444,7 +459,7 @@ data:
               "expr": "count(cortex_in_flight_requests{api_name=~\"$api_name\"}) by (api_name)",
               "interval": "",
               "legendFormat": "{{api_name}}",
-              "refId": "A"
+              "refId": "Active Replicas"
             }
           ],
           "thresholds": [],
@@ -467,6 +482,7 @@ data:
           },
           "yaxes": [
             {
+              "$$hashKey": "object:236",
               "decimals": 0,
               "format": "short",
               "label": null,
@@ -476,6 +492,7 @@ data:
               "show": true
             },
             {
+              "$$hashKey": "object:237",
               "format": "short",
               "label": null,
               "logBase": 1,
@@ -508,7 +525,7 @@ data:
             "h": 9,
             "w": 12,
             "x": 0,
-            "y": 20
+            "y": 21
           },
           "hiddenSeries": false,
           "id": 9,
@@ -530,7 +547,7 @@ data:
             "alertThreshold": true
           },
           "percentage": false,
-          "pluginVersion": "7.4.0",
+          "pluginVersion": "7.4.2",
           "pointradius": 2,
           "points": false,
           "renderer": "flot",
@@ -622,7 +639,7 @@ data:
             "h": 9,
             "w": 12,
             "x": 12,
-            "y": 20
+            "y": 21
           },
           "hiddenSeries": false,
           "id": 10,
@@ -644,7 +661,7 @@ data:
             "alertThreshold": true
           },
           "percentage": false,
-          "pluginVersion": "7.4.0",
+          "pluginVersion": "7.4.2",
           "pointradius": 2,
           "points": false,
           "renderer": "flot",
@@ -736,7 +753,7 @@ data:
             "h": 9,
             "w": 12,
             "x": 0,
-            "y": 29
+            "y": 30
           },
           "hiddenSeries": false,
           "id": 6,
@@ -757,7 +774,7 @@ data:
             "alertThreshold": true
           },
           "percentage": false,
-          "pluginVersion": "7.4.0",
+          "pluginVersion": "7.4.2",
           "pointradius": 2,
           "points": false,
           "renderer": "flot",
@@ -809,6 +826,7 @@ data:
           },
           "yaxes": [
             {
+              "$$hashKey": "object:1302",
               "format": "ms",
               "label": null,
               "logBase": 1,
@@ -817,6 +835,7 @@ data:
               "show": true
             },
             {
+              "$$hashKey": "object:1303",
               "format": "short",
               "label": null,
               "logBase": 1,
@@ -849,7 +868,7 @@ data:
             "h": 9,
             "w": 12,
             "x": 12,
-            "y": 29
+            "y": 30
           },
           "hiddenSeries": false,
           "id": 11,
@@ -870,7 +889,7 @@ data:
             "alertThreshold": true
           },
           "percentage": false,
-          "pluginVersion": "7.4.0",
+          "pluginVersion": "7.4.2",
           "pointradius": 2,
           "points": false,
           "renderer": "flot",
@@ -945,7 +964,9 @@ data:
           }
         },
         {
-          "aliasColors": {},
+          "aliasColors": {
+            "iris-classifier": "light-green"
+          },
           "bars": false,
           "dashLength": 10,
           "dashes": false,
@@ -963,7 +984,7 @@ data:
             "h": 9,
             "w": 12,
             "x": 0,
-            "y": 38
+            "y": 39
           },
           "hiddenSeries": false,
           "id": 16,
@@ -984,7 +1005,7 @@ data:
             "alertThreshold": true
           },
           "percentage": false,
-          "pluginVersion": "7.4.0",
+          "pluginVersion": "7.4.2",
           "pointradius": 2,
           "points": false,
           "renderer": "flot",
@@ -1077,7 +1098,7 @@ data:
             "h": 9,
             "w": 12,
             "x": 12,
-            "y": 38
+            "y": 39
           },
           "hiddenSeries": false,
           "id": 12,
@@ -1098,7 +1119,7 @@ data:
             "alertThreshold": true
           },
           "percentage": false,
-          "pluginVersion": "7.4.0",
+          "pluginVersion": "7.4.2",
           "pointradius": 2,
           "points": false,
           "renderer": "flot",
@@ -1171,6 +1192,967 @@ data:
             "align": false,
             "alignLevel": null
           }
+        },
+        {
+          "collapsed": false,
+          "datasource": null,
+          "gridPos": {
+            "h": 1,
+            "w": 24,
+            "x": 0,
+            "y": 48
+          },
+          "id": 20,
+          "panels": [],
+          "title": "Aggregate Usage",
+          "type": "row"
+        },
+        {
+          "aliasColors": {
+            "Total CPU Request": "semi-dark-orange",
+            "Total CPU Usage": "semi-dark-green"
+          },
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": null,
+          "description": "Total CPU usage across all replicas of the API",
+          "fieldConfig": {
+            "defaults": {
+              "color": {},
+              "custom": {},
+              "thresholds": {
+                "mode": "absolute",
+                "steps": []
+              }
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 0,
+            "y": 49
+          },
+          "hiddenSeries": false,
+          "id": 24,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "nullPointMode": "null",
+          "options": {
+            "alertThreshold": true
+          },
+          "percentage": false,
+          "pluginVersion": "7.4.2",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "exemplar": false,
+              "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))",
+              "format": "time_series",
+              "instant": false,
+              "interval": "",
+              "legendFormat": "Total CPU Usage",
+              "refId": "CPU Usage"
+            },
+            {
+              "expr": "sum(kube_pod_container_resource_requests_cpu_cores{exported_pod=~\"api-$api_name.+\"})",
+              "hide": false,
+              "interval": "",
+              "legendFormat": "Total CPU Request",
+              "refId": "CPU Request"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "Total CPU Usage",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "$$hashKey": "object:1404",
+              "format": "core",
+              "label": "cpu",
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "$$hashKey": "object:1405",
+              "format": "short",
+              "label": "",
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": false
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        },
+        {
+          "aliasColors": {
+            "Total Memory Request": "semi-dark-orange",
+            "Total Memory Usage": "semi-dark-green"
+          },
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": null,
+          "description": "Total memory usage across all replicas of the API",
+          "fieldConfig": {
+            "defaults": {
+              "color": {},
+              "custom": {},
+              "thresholds": {
+                "mode": "absolute",
+                "steps": []
+              }
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 49
+          },
+          "hiddenSeries": false,
+          "id": 26,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "nullPointMode": "null",
+          "options": {
+            "alertThreshold": true
+          },
+          "percentage": false,
+          "pluginVersion": "7.4.2",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "exemplar": false,
+              "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container!=\"POD\"}[1m]))\n/\nsum(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2",
+              "format": "time_series",
+              "instant": false,
+              "interval": "",
+              "legendFormat": "Total Memory Usage",
+              "refId": "Memory Usage"
+            },
+            {
+              "expr": "sum(kube_pod_container_resource_requests_memory_bytes{exported_pod=~\"api-$api_name.+\"}) / 1024^2",
+              "hide": false,
+              "interval": "",
+              "legendFormat": "Total Memory Request",
+              "refId": "Memory Request"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "Total Memory Usage",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "$$hashKey": "object:1404",
+              "format": "MiB",
+              "label": "memory",
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "$$hashKey": "object:1405",
+              "format": "short",
+              "label": "",
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": false
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        },
+        {
+          "aliasColors": {
+            "Total GPU Capacity": "semi-dark-orange",
+            "Total GPU Usage": "semi-dark-green",
+            "Total GPU Utilization": "light-green"
+          },
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": null,
+          "description": "Total GPU core usage across all replicas of the API",
+          "fieldConfig": {
+            "defaults": {
+              "color": {},
+              "custom": {},
+              "thresholds": {
+                "mode": "absolute",
+                "steps": []
+              }
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 0,
+            "y": 57
+          },
+          "hiddenSeries": false,
+          "id": 28,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "nullPointMode": "null",
+          "options": {
+            "alertThreshold": true
+          },
+          "percentage": false,
+          "pluginVersion": "7.4.2",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"}) / 100",
+              "hide": false,
+              "interval": "",
+              "legendFormat": "Total GPU Usage",
+              "refId": "GPU Usage"
+            },
+            {
+              "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"})",
+              "hide": false,
+              "interval": "",
+              "legendFormat": "Total GPU Capacity",
+              "refId": "GPU Capacity"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "Total GPU Core Usage",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "$$hashKey": "object:1404",
+              "format": "gpuCore",
+              "label": "gpu",
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "$$hashKey": "object:1405",
+              "format": "short",
+              "label": "",
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": false
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        },
+        {
+          "aliasColors": {
+            "Total Capacity GPU Memory": "semi-dark-orange",
+            "Total Used GPU Memory": "semi-dark-green"
+          },
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": null,
+          "description": "Total GPU memory usage across all replicas of the API",
+          "fieldConfig": {
+            "defaults": {
+              "color": {},
+              "custom": {},
+              "thresholds": {
+                "mode": "absolute",
+                "steps": []
+              }
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 57
+          },
+          "hiddenSeries": false,
+          "id": 29,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "nullPointMode": "null",
+          "options": {
+            "alertThreshold": true
+          },
+          "percentage": false,
+          "pluginVersion": "7.4.2",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})",
+              "hide": false,
+              "interval": "",
+              "legendFormat": "Total Used GPU Memory",
+              "refId": "GPU Used Memory"
+            },
+            {
+              "exemplar": false,
+              "expr": "sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})",
+              "format": "time_series",
+              "instant": false,
+              "interval": "",
+              "legendFormat": "Total Capacity GPU Memory",
+              "refId": "GPU Capacity Memory"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "Total GPU Memory Usage",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "$$hashKey": "object:1404",
+              "format": "MiB",
+              "label": "memory",
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "$$hashKey": "object:1405",
+              "format": "short",
+              "label": "",
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": false
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        },
+        {
+          "collapsed": false,
+          "datasource": null,
+          "gridPos": {
+            "h": 1,
+            "w": 24,
+            "x": 0,
+            "y": 65
+          },
+          "id": 18,
+          "panels": [],
+          "title": "Average Replica Usage",
+          "type": "row"
+        },
+        {
+          "aliasColors": {
+            "Avg CPU Request": "semi-dark-orange",
+            "Avg CPU Usage": "semi-dark-green",
+            "Total CPU Request": "semi-dark-orange",
+            "Total CPU Usage": "semi-dark-green"
+          },
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": null,
+          "description": "Avg CPU usage across all replicas of the API",
+          "fieldConfig": {
+            "defaults": {
+              "color": {},
+              "custom": {},
+              "thresholds": {
+                "mode": "absolute",
+                "steps": []
+              }
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 0,
+            "y": 66
+          },
+          "hiddenSeries": false,
+          "id": 30,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "nullPointMode": "null",
+          "options": {
+            "alertThreshold": true
+          },
+          "percentage": false,
+          "pluginVersion": "7.4.2",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "exemplar": false,
+              "expr": "sum(rate(container_cpu_usage_seconds_total{pod=~\"api-$api_name.+\", container!=\"POD\", name!=\"\"}[1m]))\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})",
+              "format": "time_series",
+              "instant": false,
+              "interval": "",
+              "legendFormat": "Avg CPU Usage",
+              "refId": "CPU Usage"
+            },
+            {
+              "expr": "sum(kube_pod_container_resource_requests_cpu_cores{exported_pod=~\"api-$api_name.+\"})\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})",
+              "hide": false,
+              "interval": "",
+              "legendFormat": "Avg CPU Request",
+              "refId": "CPU Request"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "Avg CPU Usage",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "$$hashKey": "object:1404",
+              "format": "core",
+              "label": "cpu",
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "$$hashKey": "object:1405",
+              "format": "short",
+              "label": "",
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": false
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        },
+        {
+          "aliasColors": {
+            "Avg Memory Request": "semi-dark-orange",
+            "Avg Memory Usage": "semi-dark-green",
+            "Total Memory Request": "semi-dark-orange",
+            "Total Memory Usage": "semi-dark-green"
+          },
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": null,
+          "description": "Avg memory usage across all replicas of the API",
+          "fieldConfig": {
+            "defaults": {
+              "color": {},
+              "custom": {},
+              "thresholds": {
+                "mode": "absolute",
+                "steps": []
+              }
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 66
+          },
+          "hiddenSeries": false,
+          "id": 31,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "nullPointMode": "null",
+          "options": {
+            "alertThreshold": true
+          },
+          "percentage": false,
+          "pluginVersion": "7.4.2",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "exemplar": false,
+              "expr": "sum(sum_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container!=\"POD\"}[1m]))\n/\nsum(count_over_time(container_memory_working_set_bytes{pod=~\"api-$api_name.+\", name!=\"\", container=\"api\"}[1m])) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})",
+              "format": "time_series",
+              "instant": false,
+              "interval": "",
+              "legendFormat": "Avg Memory Usage",
+              "refId": "Memory Usage"
+            },
+            {
+              "expr": "sum(kube_pod_container_resource_requests_memory_bytes{exported_pod=~\"api-$api_name.+\"}) / 1024^2\n/\nsum(kube_pod_info{exported_pod=~\"api-$api_name.+\"})",
+              "hide": false,
+              "interval": "",
+              "legendFormat": "Avg Memory Request",
+              "refId": "Memory Request"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "Avg Memory Usage",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "$$hashKey": "object:1404",
+              "format": "MiB",
+              "label": "memory",
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "$$hashKey": "object:1405",
+              "format": "short",
+              "label": "",
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": false
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        },
+        {
+          "aliasColors": {
+            "Avg GPU Capacity": "semi-dark-orange",
+            "Avg GPU Usage": "semi-dark-green",
+            "Total GPU Capacity": "semi-dark-orange",
+            "Total GPU Utilization": "semi-dark-green"
+          },
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": null,
+          "description": "Avg GPU core usage across all replicas of the API",
+          "fieldConfig": {
+            "defaults": {
+              "color": {},
+              "custom": {},
+              "thresholds": {
+                "mode": "absolute",
+                "steps": []
+              }
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 0,
+            "y": 74
+          },
+          "hiddenSeries": false,
+          "id": 32,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "nullPointMode": "null",
+          "options": {
+            "alertThreshold": true
+          },
+          "percentage": false,
+          "pluginVersion": "7.4.2",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"}) / 100\n/\ncount(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"})",
+              "hide": false,
+              "interval": "",
+              "legendFormat": "Avg GPU Usage",
+              "refId": "GPU Usage"
+            },
+            {
+              "expr": "count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"})\n/\ncount(count(DCGM_FI_DEV_GPU_UTIL{exported_pod=~\"api-$api_name.+\"}) by (exported_pod))",
+              "hide": false,
+              "interval": "",
+              "legendFormat": "Avg GPU Capacity",
+              "refId": "GPU Capacity"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "Avg GPU Core Usage",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "$$hashKey": "object:1404",
+              "format": "gpuCore",
+              "label": "gpu",
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "$$hashKey": "object:1405",
+              "format": "short",
+              "label": "",
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": false
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        },
+        {
+          "aliasColors": {
+            "Avg Capacity GPU Memory": "semi-dark-orange",
+            "Avg Used GPU Memory": "semi-dark-green",
+            "Total Capacity GPU Memory": "semi-dark-orange",
+            "Total Used GPU Memory": "semi-dark-green"
+          },
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": null,
+          "description": "Avg GPU memory usage across all replicas of the API",
+          "fieldConfig": {
+            "defaults": {
+              "color": {},
+              "custom": {},
+              "thresholds": {
+                "mode": "absolute",
+                "steps": []
+              }
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 74
+          },
+          "hiddenSeries": false,
+          "id": 33,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "nullPointMode": "null",
+          "options": {
+            "alertThreshold": true
+          },
+          "percentage": false,
+          "pluginVersion": "7.4.2",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})",
+              "hide": false,
+              "interval": "",
+              "legendFormat": "Avg Used GPU Memory",
+              "refId": "GPU Used Memory"
+            },
+            {
+              "exemplar": false,
+              "expr": "(sum(DCGM_FI_DEV_FB_FREE{exported_pod=~\"api-$api_name.+\"}) + sum(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"}))\n/\ncount(DCGM_FI_DEV_FB_USED{exported_pod=~\"api-$api_name.+\"})",
+              "format": "time_series",
+              "instant": false,
+              "interval": "",
+              "legendFormat": "Avg Capacity GPU Memory",
+              "refId": "GPU Capacity Memory"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "Avg GPU Memory Usage",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "$$hashKey": "object:1404",
+              "format": "MiB",
+              "label": "memory",
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "$$hashKey": "object:1405",
+              "format": "short",
+              "label": "",
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": false
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
         }
       ],
       "refresh": "30s",
@@ -1216,5 +2198,5 @@ data:
       "timezone": "",
       "title": "RealtimeAPI",
       "uid": "realtimeapi",
-      "version": 1
+      "version": 4
     }
diff --git a/manager/manifests/prometheus-dcgm-exporter.yaml b/manager/manifests/prometheus-dcgm-exporter.yaml
new file mode 100644
index 0000000000..27ec085d6f
--- /dev/null
+++ b/manager/manifests/prometheus-dcgm-exporter.yaml
@@ -0,0 +1,136 @@
+# Copyright 2021 Cortex Labs, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: monitoring
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: dcgm-exporter
+  namespace: default
+  labels:
+    app.kubernetes.io/name: dcgm-exporter
+    app.kubernetes.io/instance: dcgm-exporter
+    app.kubernetes.io/component: dcgm-exporter
+---
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: dcgm-exporter
+  namespace: default
+  labels:
+    app.kubernetes.io/name: dcgm-exporter
+    app.kubernetes.io/instance: dcgm-exporter
+    app.kubernetes.io/component: dcgm-exporter
+spec:
+  updateStrategy:
+    type: RollingUpdate
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: dcgm-exporter
+      app.kubernetes.io/instance: dcgm-exporter
+      app.kubernetes.io/component: dcgm-exporter
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: dcgm-exporter
+        app.kubernetes.io/instance: dcgm-exporter
+        app.kubernetes.io/component: dcgm-exporter
+    spec:
+      serviceAccountName: dcgm-exporter
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+            - matchExpressions:
+              - key: nvidia.com/gpu
+                operator: Exists
+      containers:
+      - env:
+        - name: DCGM_EXPORTER_LISTEN
+          value: :9400
+        - name: DCGM_EXPORTER_KUBERNETES
+          value: "true"
+        image: $CORTEX_IMAGE_PROMETHEUS_DCGM_EXPORTER
+        imagePullPolicy: Always
+        name: dcgm-exporter
+        ports:
+        - containerPort: 9400
+          name: metrics
+          protocol: TCP
+        resources:
+          requests:
+            cpu: 50m
+            memory: 50Mi
+        securityContext:
+          privileged: true
+        terminationMessagePath: /dev/termination-log
+        terminationMessagePolicy: File
+        volumeMounts:
+        - mountPath: /var/lib/kubelet/pod-resources
+          name: pod-gpu-resources
+          readOnly: true
+        - mountPath: /usr/local/nvidia
+          name: nvidia-install-dir-host
+      dnsPolicy: ClusterFirst
+      restartPolicy: Always
+      schedulerName: default-scheduler
+      securityContext: {}
+      terminationGracePeriodSeconds: 30
+      tolerations:
+      - key: workload
+        effect: NoSchedule
+        operator: Exists
+      - key: nvidia.com/gpu
+        operator: Exists
+        effect: NoSchedule
+      volumes:
+      - hostPath:
+          path: /var/lib/kubelet/pod-resources
+          type: ""
+        name: pod-gpu-resources
+      - hostPath:
+          path: /home/kubernetes/bin/nvidia
+          type: ""
+        name: nvidia-install-dir-host
+---
+apiVersion: monitoring.coreos.com/v1
+kind: PodMonitor
+metadata:
+  name: dcgm-exporter
+  namespace: default
+  labels:
+    monitoring.cortex.dev: dcgm-exporter
+    app.kubernetes.io/name: dcgm-exporter
+    app.kubernetes.io/instance: dcgm-exporter
+    app.kubernetes.io/component: dcgm-exporter
+  annotations:
+    prometheus.io/scrape: 'true'
+    prometheus.io/port: '9400'
+spec:
+  jobLabel: "dcgm-exporter"
+  podMetricsEndpoints:
+    - port: metrics
+      path: /metrics
+      scheme: http
+      interval: 15s
+  namespaceSelector:
+    any: true
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: dcgm-exporter
+      app.kubernetes.io/instance: dcgm-exporter
diff --git a/manager/manifests/prometheus-kube-state-metrics.yaml b/manager/manifests/prometheus-kube-state-metrics.yaml
new file mode 100644
index 0000000000..135e5ffdf8
--- /dev/null
+++ b/manager/manifests/prometheus-kube-state-metrics.yaml
@@ -0,0 +1,277 @@
+# Copyright 2021 Cortex Labs, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  labels:
+    app.kubernetes.io/name: kube-state-metrics
+  name: kube-state-metrics
+  namespace: default
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  labels:
+    app.kubernetes.io/name: kube-state-metrics
+  name: kube-state-metrics
+rules:
+
+- apiGroups: ["certificates.k8s.io"]
+  resources:
+  - certificatesigningrequests
+  verbs: ["list", "watch"]
+
+- apiGroups: [""]
+  resources:
+  - configmaps
+  verbs: ["list", "watch"]
+
+- apiGroups: ["batch"]
+  resources:
+  - cronjobs
+  verbs: ["list", "watch"]
+
+- apiGroups: ["extensions", "apps"]
+  resources:
+  - daemonsets
+  verbs: ["list", "watch"]
+
+- apiGroups: ["extensions", "apps"]
+  resources:
+  - deployments
+  verbs: ["list", "watch"]
+
+- apiGroups: [""]
+  resources:
+  - endpoints
+  verbs: ["list", "watch"]
+
+- apiGroups: ["autoscaling"]
+  resources:
+  - horizontalpodautoscalers
+  verbs: ["list", "watch"]
+
+- apiGroups: ["extensions", "networking.k8s.io"]
+  resources:
+  - ingresses
+  verbs: ["list", "watch"]
+
+- apiGroups: ["batch"]
+  resources:
+  - jobs
+  verbs: ["list", "watch"]
+
+- apiGroups: [""]
+  resources:
+  - limitranges
+  verbs: ["list", "watch"]
+
+- apiGroups: ["admissionregistration.k8s.io"]
+  resources:
+    - mutatingwebhookconfigurations
+  verbs: ["list", "watch"]
+
+- apiGroups: [""]
+  resources:
+  - namespaces
+  verbs: ["list", "watch"]
+
+- apiGroups: ["networking.k8s.io"]
+  resources:
+  - networkpolicies
+  verbs: ["list", "watch"]
+
+- apiGroups: [""]
+  resources:
+  - nodes
+  verbs: ["list", "watch"]
+
+- apiGroups: [""]
+  resources:
+  - persistentvolumeclaims
+  verbs: ["list", "watch"]
+
+- apiGroups: [""]
+  resources:
+  - persistentvolumes
+  verbs: ["list", "watch"]
+
+- apiGroups: ["policy"]
+  resources:
+    - poddisruptionbudgets
+  verbs: ["list", "watch"]
+
+- apiGroups: [""]
+  resources:
+  - pods
+  verbs: ["list", "watch"]
+
+- apiGroups: ["extensions", "apps"]
+  resources:
+  - replicasets
+  verbs: ["list", "watch"]
+
+- apiGroups: [""]
+  resources:
+  - replicationcontrollers
+  verbs: ["list", "watch"]
+
+- apiGroups: [""]
+  resources:
+  - resourcequotas
+  verbs: ["list", "watch"]
+
+- apiGroups: [""]
+  resources:
+  - secrets
+  verbs: ["list", "watch"]
+
+- apiGroups: [""]
+  resources:
+  - services
+  verbs: ["list", "watch"]
+
+- apiGroups: ["apps"]
+  resources:
+  - statefulsets
+  verbs: ["list", "watch"]
+
+- apiGroups: ["storage.k8s.io"]
+  resources:
+    - storageclasses
+  verbs: ["list", "watch"]
+
+- apiGroups: ["admissionregistration.k8s.io"]
+  resources:
+    - validatingwebhookconfigurations
+  verbs: ["list", "watch"]
+
+- apiGroups: ["storage.k8s.io"]
+  resources:
+    - volumeattachments
+  verbs: ["list", "watch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  labels:
+    app.kubernetes.io/name: kube-state-metrics
+  name: kube-state-metrics
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: kube-state-metrics
+subjects:
+- kind: ServiceAccount
+  name: kube-state-metrics
+  namespace: default
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: kube-state-metrics
+  namespace: default
+  labels:
+    app.kubernetes.io/name: kube-state-metrics
+    app.kubernetes.io/version: "1.9.8"
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: kube-state-metrics
+  replicas: 1
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: kube-state-metrics
+    spec:
+      hostNetwork: false
+      serviceAccountName: kube-state-metrics
+      securityContext:
+        fsGroup: 65534
+        runAsGroup: 65534
+        runAsUser: 65534
+      containers:
+      - name: kube-state-metrics
+        resources:
+          requests:
+            cpu: 300m
+            memory: 400Mi
+        args:
+        - --collectors=certificatesigningrequests
+        - --collectors=configmaps
+        - --collectors=cronjobs
+        - --collectors=daemonsets
+        - --collectors=deployments
+        - --collectors=endpoints
+        - --collectors=horizontalpodautoscalers
+        - --collectors=ingresses
+        - --collectors=jobs
+        - --collectors=limitranges
+        - --collectors=mutatingwebhookconfigurations
+        - --collectors=namespaces
+        - --collectors=networkpolicies
+        - --collectors=nodes
+        - --collectors=persistentvolumeclaims
+        - --collectors=persistentvolumes
+        - --collectors=poddisruptionbudgets
+        - --collectors=pods
+        - --collectors=replicasets
+        - --collectors=replicationcontrollers
+        - --collectors=resourcequotas
+        - --collectors=secrets
+        - --collectors=services
+        - --collectors=statefulsets
+        - --collectors=storageclasses
+        - --collectors=validatingwebhookconfigurations
+        - --collectors=volumeattachments
+        - --telemetry-port=8081
+        imagePullPolicy: Always
+        image: $CORTEX_IMAGE_PROMETHEUS_KUBE_STATE_METRICS
+        ports:
+        - containerPort: 8080
+          name: metrics
+          protocol: TCP
+        livenessProbe:
+          httpGet:
+            path: /healthz
+            port: 8080
+          initialDelaySeconds: 5
+          timeoutSeconds: 5
+        readinessProbe:
+          httpGet:
+            path: /
+            port: 8080
+          initialDelaySeconds: 5
+          timeoutSeconds: 5
+---
+apiVersion: monitoring.coreos.com/v1
+kind: PodMonitor
+metadata:
+  name: kube-state-metrics
+  namespace: default
+  labels:
+    name: kube-state-metrics
+    monitoring.cortex.dev: kube-state-metrics
+spec:
+  jobLabel: "kube-state-metrics"
+  podMetricsEndpoints:
+    - port: metrics
+      scheme: http
+      path: /metrics
+      interval: 30s
+  namespaceSelector:
+    any: true
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: kube-state-metrics
diff --git a/manager/manifests/prometheus-kubelet-exporter.yaml b/manager/manifests/prometheus-kubelet-exporter.yaml
new file mode 100644
index 0000000000..8677500ef9
--- /dev/null
+++ b/manager/manifests/prometheus-kubelet-exporter.yaml
@@ -0,0 +1,105 @@
+# Copyright 2021 Cortex Labs, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  labels:
+    k8s-app: kubelet
+    monitoring.cortex.dev: kubelet-exporter
+  name: kubelet
+  namespace: default
+spec:
+  endpoints:
+  - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+    honorLabels: true
+    interval: 30s
+    metricRelabelings:
+    - action: drop
+      regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs)
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex: transformation_(transformation_latencies_microseconds|failures_total)
+      sourceLabels:
+      - __name__
+    - action: drop
+      regex: (admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_condition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retries|AvailableConditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retries|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rules_latency_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries)
+      sourceLabels:
+      - __name__
+    port: https-metrics
+    relabelings:
+    - sourceLabels:
+      - __metrics_path__
+      targetLabel: metrics_path
+    scheme: https
+    tlsConfig:
+      insecureSkipVerify: true
+  - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+    honorLabels: true
+    honorTimestamps: false
+    interval: 30s
+    metricRelabelings:
+    - action: drop
+      regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)
+      sourceLabels:
+      - __name__
+    path: /metrics/cadvisor
+    port: https-metrics
+    relabelings:
+    - sourceLabels:
+      - __metrics_path__
+      targetLabel: metrics_path
+    scheme: https
+    tlsConfig:
+      insecureSkipVerify: true
+  - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+    honorLabels: true
+    interval: 30s
+    path: /metrics/probes
+    port: https-metrics
+    relabelings:
+    - sourceLabels:
+      - __metrics_path__
+      targetLabel: metrics_path
+    scheme: https
+    tlsConfig:
+      insecureSkipVerify: true
+  jobLabel: k8s-app
+  namespaceSelector:
+    matchNames:
+    - kube-system
+  selector:
+    matchLabels:
+      k8s-app: kubelet
diff --git a/manager/manifests/prometheus-monitoring.yaml.j2 b/manager/manifests/prometheus-monitoring.yaml.j2
index eb5b2d4c21..d10bf75cd9 100644
--- a/manager/manifests/prometheus-monitoring.yaml.j2
+++ b/manager/manifests/prometheus-monitoring.yaml.j2
@@ -40,12 +40,12 @@ spec:
     matchExpressions:
       - key: "monitoring.cortex.dev"
         operator: "In"
-        values: [ "istio", "request-monitor", "statsd-exporter" ]
+        values: [ "istio", "request-monitor", "statsd-exporter", "dcgm-exporter", "kube-state-metrics" ]
   serviceMonitorSelector:
     matchExpressions:
       - key: "monitoring.cortex.dev"
         operator: "In"
-        values: [ "node-exporter" ]
+        values: [ "kubelet-exporter", "node-exporter" ]
   ruleSelector:
     matchLabels:
       prometheus: k8s
diff --git a/pkg/operator/operator/k8s.go b/pkg/operator/operator/k8s.go
index 0c6689fbe1..c1e8aacc68 100644
--- a/pkg/operator/operator/k8s.go
+++ b/pkg/operator/operator/k8s.go
@@ -207,13 +207,17 @@ func PythonPredictorContainers(api *spec.API) ([]kcore.Container, []kcore.Volume
 	if api.Compute.Inf == 0 {
 		if api.Compute.CPU != nil {
 			userPodCPURequest := k8s.QuantityPtr(api.Compute.CPU.Quantity.DeepCopy())
-			userPodCPURequest.Sub(_requestMonitorCPURequest)
+			if api.Kind == userconfig.RealtimeAPIKind {
+				userPodCPURequest.Sub(_requestMonitorCPURequest)
+			}
 			apiPodResourceList[kcore.ResourceCPU] = *userPodCPURequest
 		}
 
 		if api.Compute.Mem != nil {
 			userPodMemRequest := k8s.QuantityPtr(api.Compute.Mem.Quantity.DeepCopy())
-			userPodMemRequest.Sub(_requestMonitorMemRequest)
+			if api.Kind == userconfig.RealtimeAPIKind {
+				userPodMemRequest.Sub(_requestMonitorMemRequest)
+			}
 			apiPodResourceList[kcore.ResourceMemory] = *userPodMemRequest
 		}
 
@@ -236,7 +240,9 @@ func PythonPredictorContainers(api *spec.API) ([]kcore.Container, []kcore.Volume
 
 		if api.Compute.CPU != nil {
 			userPodCPURequest := k8s.QuantityPtr(api.Compute.CPU.Quantity.DeepCopy())
-			userPodCPURequest.Sub(_requestMonitorCPURequest)
+			if api.Kind == userconfig.RealtimeAPIKind {
+				userPodCPURequest.Sub(_requestMonitorCPURequest)
+			}
 			q1, q2 := k8s.SplitInTwo(userPodCPURequest)
 			apiPodResourceList[kcore.ResourceCPU] = *q1
 			neuronContainer.Resources.Requests[kcore.ResourceCPU] = *q2
@@ -244,7 +250,9 @@ func PythonPredictorContainers(api *spec.API) ([]kcore.Container, []kcore.Volume
 
 		if api.Compute.Mem != nil {
 			userPodMemRequest := k8s.QuantityPtr(api.Compute.Mem.Quantity.DeepCopy())
-			userPodMemRequest.Sub(_requestMonitorMemRequest)
+			if api.Kind == userconfig.RealtimeAPIKind {
+				userPodMemRequest.Sub(_requestMonitorMemRequest)
+			}
 			q1, q2 := k8s.SplitInTwo(userPodMemRequest)
 			apiPodResourceList[kcore.ResourceMemory] = *q1
 			neuronContainer.Resources.Requests[kcore.ResourceMemory] = *q2
@@ -304,7 +312,9 @@ func TensorFlowPredictorContainers(api *spec.API) ([]kcore.Container, []kcore.Vo
 	if api.Compute.Inf == 0 {
 		if api.Compute.CPU != nil {
 			userPodCPURequest := k8s.QuantityPtr(api.Compute.CPU.Quantity.DeepCopy())
-			userPodCPURequest.Sub(_requestMonitorCPURequest)
+			if api.Kind == userconfig.RealtimeAPIKind {
+				userPodCPURequest.Sub(_requestMonitorCPURequest)
+			}
 			q1, q2 := k8s.SplitInTwo(userPodCPURequest)
 			apiResourceList[kcore.ResourceCPU] = *q1
 			tfServingResourceList[kcore.ResourceCPU] = *q2
@@ -312,7 +322,9 @@ func TensorFlowPredictorContainers(api *spec.API) ([]kcore.Container, []kcore.Vo
 
 		if api.Compute.Mem != nil {
 			userPodMemRequest := k8s.QuantityPtr(api.Compute.Mem.Quantity.DeepCopy())
-			userPodMemRequest.Sub(_requestMonitorMemRequest)
+			if api.Kind == userconfig.RealtimeAPIKind {
+				userPodMemRequest.Sub(_requestMonitorMemRequest)
+			}
 			q1, q2 := k8s.SplitInTwo(userPodMemRequest)
 			apiResourceList[kcore.ResourceMemory] = *q1
 			tfServingResourceList[kcore.ResourceMemory] = *q2
@@ -338,7 +350,9 @@ func TensorFlowPredictorContainers(api *spec.API) ([]kcore.Container, []kcore.Vo
 
 		if api.Compute.CPU != nil {
 			userPodCPURequest := k8s.QuantityPtr(api.Compute.CPU.Quantity.DeepCopy())
-			userPodCPURequest.Sub(_requestMonitorCPURequest)
+			if api.Kind == userconfig.RealtimeAPIKind {
+				userPodCPURequest.Sub(_requestMonitorCPURequest)
+			}
 			q1, q2, q3 := k8s.SplitInThree(userPodCPURequest)
 			apiResourceList[kcore.ResourceCPU] = *q1
 			tfServingResourceList[kcore.ResourceCPU] = *q2
@@ -347,7 +361,9 @@ func TensorFlowPredictorContainers(api *spec.API) ([]kcore.Container, []kcore.Vo
 
 		if api.Compute.Mem != nil {
 			userPodMemRequest := k8s.QuantityPtr(api.Compute.Mem.Quantity.DeepCopy())
-			userPodMemRequest.Sub(_requestMonitorMemRequest)
+			if api.Kind == userconfig.RealtimeAPIKind {
+				userPodMemRequest.Sub(_requestMonitorMemRequest)
+			}
 			q1, q2, q3 := k8s.SplitInThree(userPodMemRequest)
 			apiResourceList[kcore.ResourceMemory] = *q1
 			tfServingResourceList[kcore.ResourceMemory] = *q2
@@ -413,13 +429,17 @@ func ONNXPredictorContainers(api *spec.API) ([]kcore.Container, []kcore.Volume)
 
 	if api.Compute.CPU != nil {
 		userPodCPURequest := k8s.QuantityPtr(api.Compute.CPU.Quantity.DeepCopy())
-		userPodCPURequest.Sub(_requestMonitorCPURequest)
+		if api.Kind == userconfig.RealtimeAPIKind {
+			userPodCPURequest.Sub(_requestMonitorCPURequest)
+		}
 		resourceList[kcore.ResourceCPU] = *userPodCPURequest
 	}
 
 	if api.Compute.Mem != nil {
 		userPodMemRequest := k8s.QuantityPtr(api.Compute.Mem.Quantity.DeepCopy())
-		userPodMemRequest.Sub(_requestMonitorMemRequest)
+		if api.Kind == userconfig.RealtimeAPIKind {
+			userPodMemRequest.Sub(_requestMonitorMemRequest)
+		}
 		resourceList[kcore.ResourceMemory] = *userPodMemRequest
 	}
 
@@ -958,6 +978,16 @@ func RequestMonitorContainer(api *spec.API) kcore.Container {
 		image = config.GCPCoreConfig.ImageRequestMonitor
 	}
 
+	requests := kcore.ResourceList{}
+	if api.Compute != nil {
+		if api.Compute.CPU != nil {
+			requests[kcore.ResourceCPU] = _requestMonitorCPURequest
+		}
+		if api.Compute.Mem != nil {
+			requests[kcore.ResourceMemory] = _requestMonitorMemRequest
+		}
+	}
+
 	return kcore.Container{
 		Name:            _requestMonitorContainerName,
 		Image:           image,
@@ -971,10 +1001,7 @@ func RequestMonitorContainer(api *spec.API) kcore.Container {
 		VolumeMounts:   defaultVolumeMounts(),
 		ReadinessProbe: FileExistsProbe(_requestMonitorReadinessFile),
 		Resources: kcore.ResourceRequirements{
-			Requests: kcore.ResourceList{
-				kcore.ResourceCPU:    _requestMonitorCPURequest,
-				kcore.ResourceMemory: _requestMonitorMemRequest,
-			},
+			Requests: requests,
 		},
 	}
 }
diff --git a/pkg/operator/resources/validations.go b/pkg/operator/resources/validations.go
index 2bd6d9abc9..d71eb6bd61 100644
--- a/pkg/operator/resources/validations.go
+++ b/pkg/operator/resources/validations.go
@@ -168,6 +168,9 @@ var _cortexMemReserve = kresource.MustParse("1230Mi")
 var _nvidiaCPUReserve = kresource.MustParse("100m")
 var _nvidiaMemReserve = kresource.MustParse("100Mi")
 
+var _nvidiaDCGMExporterCPUReserve = kresource.MustParse("50m")
+var _nvidiaDCGMExporterMemReserve = kresource.MustParse("50Mi")
+
 var _inferentiaCPUReserve = kresource.MustParse("100m")
 var _inferentiaMemReserve = kresource.MustParse("100Mi")
 
@@ -187,6 +190,9 @@ func awsManagedValidateK8sCompute(compute *userconfig.Compute, maxMem kresource.
 		// Reserve resources for nvidia device plugin daemonset
 		maxCPU.Sub(_nvidiaCPUReserve)
 		maxMem.Sub(_nvidiaMemReserve)
+		// Reserve resources for nvidia dcgm prometheus exporter
+		maxCPU.Sub(_nvidiaDCGMExporterCPUReserve)
+		maxMem.Sub(_nvidiaDCGMExporterMemReserve)
 	}
 
 	maxInf := instanceMetadata.Inf
diff --git a/pkg/types/clusterconfig/cluster_config_aws.go b/pkg/types/clusterconfig/cluster_config_aws.go
index 98f3460173..dae26b0f61 100644
--- a/pkg/types/clusterconfig/cluster_config_aws.go
+++ b/pkg/types/clusterconfig/cluster_config_aws.go
@@ -63,26 +63,28 @@ type CoreConfig struct {
 	Namespace      string             `json:"namespace" yaml:"namespace"`
 	IstioNamespace string             `json:"istio_namespace" yaml:"istio_namespace"`
 
-	ImageOperator                 string `json:"image_operator" yaml:"image_operator"`
-	ImageManager                  string `json:"image_manager" yaml:"image_manager"`
-	ImageDownloader               string `json:"image_downloader" yaml:"image_downloader"`
-	ImageRequestMonitor           string `json:"image_request_monitor" yaml:"image_request_monitor"`
-	ImageClusterAutoscaler        string `json:"image_cluster_autoscaler" yaml:"image_cluster_autoscaler"`
-	ImageMetricsServer            string `json:"image_metrics_server" yaml:"image_metrics_server"`
-	ImageInferentia               string `json:"image_inferentia" yaml:"image_inferentia"`
-	ImageNeuronRTD                string `json:"image_neuron_rtd" yaml:"image_neuron_rtd"`
-	ImageNvidia                   string `json:"image_nvidia" yaml:"image_nvidia"`
-	ImageFluentBit                string `json:"image_fluent_bit" yaml:"image_fluent_bit"`
-	ImageIstioProxy               string `json:"image_istio_proxy" yaml:"image_istio_proxy"`
-	ImageIstioPilot               string `json:"image_istio_pilot" yaml:"image_istio_pilot"`
-	ImagePrometheus               string `json:"image_prometheus" yaml:"image_prometheus"`
-	ImagePrometheusConfigReloader string `json:"image_prometheus_config_reloader" yaml:"image_prometheus_config_reloader"`
-	ImagePrometheusOperator       string `json:"image_prometheus_operator" yaml:"image_prometheus_operator"`
-	ImagePrometheusStatsDExporter string `json:"image_prometheus_statsd_exporter" yaml:"image_prometheus_statsd_exporter"`
-	ImagePrometheusNodeExporter   string `json:"image_prometheus_node_exporter" yaml:"image_prometheus_node_exporter"`
-	ImageKubeRBACProxy            string `json:"image_kube_rbac_proxy" yaml:"image_kube_rbac_proxy"`
-	ImageGrafana                  string `json:"image_grafana" yaml:"image_grafana"`
-	ImageEventExporter            string `json:"image_event_exporter" yaml:"image_event_exporter"`
+	ImageOperator                   string `json:"image_operator" yaml:"image_operator"`
+	ImageManager                    string `json:"image_manager" yaml:"image_manager"`
+	ImageDownloader                 string `json:"image_downloader" yaml:"image_downloader"`
+	ImageRequestMonitor             string `json:"image_request_monitor" yaml:"image_request_monitor"`
+	ImageClusterAutoscaler          string `json:"image_cluster_autoscaler" yaml:"image_cluster_autoscaler"`
+	ImageMetricsServer              string `json:"image_metrics_server" yaml:"image_metrics_server"`
+	ImageInferentia                 string `json:"image_inferentia" yaml:"image_inferentia"`
+	ImageNeuronRTD                  string `json:"image_neuron_rtd" yaml:"image_neuron_rtd"`
+	ImageNvidia                     string `json:"image_nvidia" yaml:"image_nvidia"`
+	ImageFluentBit                  string `json:"image_fluent_bit" yaml:"image_fluent_bit"`
+	ImageIstioProxy                 string `json:"image_istio_proxy" yaml:"image_istio_proxy"`
+	ImageIstioPilot                 string `json:"image_istio_pilot" yaml:"image_istio_pilot"`
+	ImagePrometheus                 string `json:"image_prometheus" yaml:"image_prometheus"`
+	ImagePrometheusConfigReloader   string `json:"image_prometheus_config_reloader" yaml:"image_prometheus_config_reloader"`
+	ImagePrometheusOperator         string `json:"image_prometheus_operator" yaml:"image_prometheus_operator"`
+	ImagePrometheusStatsDExporter   string `json:"image_prometheus_statsd_exporter" yaml:"image_prometheus_statsd_exporter"`
+	ImagePrometheusDCGMExporter     string `json:"image_prometheus_dcgm_exporter" yaml:"image_prometheus_dcgm_exporter"`
+	ImagePrometheusKubeStateMetrics string `json:"image_prometheus_kube_state_metrics" yaml:"image_prometheus_kube_state_metrics"`
+	ImagePrometheusNodeExporter     string `json:"image_prometheus_node_exporter" yaml:"image_prometheus_node_exporter"`
+	ImageKubeRBACProxy              string `json:"image_kube_rbac_proxy" yaml:"image_kube_rbac_proxy"`
+	ImageGrafana                    string `json:"image_grafana" yaml:"image_grafana"`
+	ImageEventExporter              string `json:"image_event_exporter" yaml:"image_event_exporter"`
 }
 
 type ManagedConfig struct {
@@ -333,6 +335,20 @@ var CoreConfigStructFieldValidations = []*cr.StructFieldValidation{
 			Validator: validateImageVersion,
 		},
 	},
+	{
+		StructField: "ImagePrometheusDCGMExporter",
+		StringValidation: &cr.StringValidation{
+			Default:   "quay.io/cortexlabs/prometheus-dcgm-exporter:" + consts.CortexVersion,
+			Validator: validateImageVersion,
+		},
+	},
+	{
+		StructField: "ImagePrometheusKubeStateMetrics",
+		StringValidation: &cr.StringValidation{
+			Default:   "quay.io/cortexlabs/prometheus-kube-state-metrics:" + consts.CortexVersion,
+			Validator: validateImageVersion,
+		},
+	},
 	{
 		StructField: "ImagePrometheusNodeExporter",
 		StringValidation: &cr.StringValidation{
@@ -1317,6 +1333,8 @@ func (cc *CoreConfig) UserTable() table.KeyValuePairs {
 	items.Add(ImagePrometheusConfigReloaderUserKey, cc.ImagePrometheusConfigReloader)
 	items.Add(ImagePrometheusOperatorUserKey, cc.ImagePrometheusOperator)
 	items.Add(ImagePrometheusStatsDExporterUserKey, cc.ImagePrometheusStatsDExporter)
+	items.Add(ImagePrometheusDCGMExporterUserKey, cc.ImagePrometheusDCGMExporter)
+	items.Add(ImagePrometheusKubeStateMetricsUserKey, cc.ImagePrometheusKubeStateMetrics)
 	items.Add(ImagePrometheusNodeExporterUserKey, cc.ImagePrometheusNodeExporter)
 	items.Add(ImageKubeRBACProxyUserKey, cc.ImageKubeRBACProxy)
 	items.Add(ImageGrafanaUserKey, cc.ImageGrafana)
@@ -1452,6 +1470,12 @@ func (cc *CoreConfig) TelemetryEvent() map[string]interface{} {
 	if strings.HasPrefix(cc.ImagePrometheusStatsDExporter, "cortexlabs/") {
 		event["image_prometheus_statsd_exporter._is_custom"] = true
 	}
+	if strings.HasPrefix(cc.ImagePrometheusDCGMExporter, "cortexlabs/") { // NOTE(review): flags images that start with "cortexlabs/"; the default starts with "quay.io/cortexlabs/" - confirm the intended _is_custom check
+		event["image_prometheus_dcgm_exporter._is_custom"] = true
+	}
+	if strings.HasPrefix(cc.ImagePrometheusKubeStateMetrics, "cortexlabs/") { // NOTE(review): same prefix test as the dcgm exporter check above - confirm
+		event["image_prometheus_kube_state_metrics._is_custom"] = true
+	}
 	if strings.HasPrefix(cc.ImagePrometheusNodeExporter, "cortexlabs/") {
 		event["image_prometheus_node_exporter._is_custom"] = true
 	}
diff --git a/pkg/types/clusterconfig/cluster_config_gcp.go b/pkg/types/clusterconfig/cluster_config_gcp.go
index d0ef58c36c..fb10c50b7c 100644
--- a/pkg/types/clusterconfig/cluster_config_gcp.go
+++ b/pkg/types/clusterconfig/cluster_config_gcp.go
@@ -44,23 +44,25 @@ type GCPCoreConfig struct {
 	IsManaged      bool               `json:"is_managed" yaml:"is_managed"`
 	Bucket         string             `json:"bucket" yaml:"bucket"`
 
-	ImageOperator                 string `json:"image_operator" yaml:"image_operator"`
-	ImageManager                  string `json:"image_manager" yaml:"image_manager"`
-	ImageDownloader               string `json:"image_downloader" yaml:"image_downloader"`
-	ImageRequestMonitor           string `json:"image_request_monitor" yaml:"image_request_monitor"`
-	ImageClusterAutoscaler        string `json:"image_cluster_autoscaler" yaml:"image_cluster_autoscaler"`
-	ImageFluentBit                string `json:"image_fluent_bit" yaml:"image_fluent_bit"`
-	ImageIstioProxy               string `json:"image_istio_proxy" yaml:"image_istio_proxy"`
-	ImageIstioPilot               string `json:"image_istio_pilot" yaml:"image_istio_pilot"`
-	ImageGooglePause              string `json:"image_google_pause" yaml:"image_google_pause"`
-	ImagePrometheus               string `json:"image_prometheus" yaml:"image_prometheus"`
-	ImagePrometheusConfigReloader string `json:"image_prometheus_config_reloader" yaml:"image_prometheus_config_reloader"`
-	ImagePrometheusOperator       string `json:"image_prometheus_operator" yaml:"image_prometheus_operator"`
-	ImagePrometheusStatsDExporter string `json:"image_prometheus_statsd_exporter" yaml:"image_prometheus_statsd_exporter"`
-	ImagePrometheusNodeExporter   string `json:"image_prometheus_node_exporter" yaml:"image_prometheus_node_exporter"`
-	ImageKubeRBACProxy            string `json:"image_kube_rbac_proxy" yaml:"image_kube_rbac_proxy"`
-	ImageGrafana                  string `json:"image_grafana" yaml:"image_grafana"`
-	ImageEventExporter            string `json:"image_event_exporter" yaml:"image_event_exporter"`
+	ImageOperator                   string `json:"image_operator" yaml:"image_operator"`
+	ImageManager                    string `json:"image_manager" yaml:"image_manager"`
+	ImageDownloader                 string `json:"image_downloader" yaml:"image_downloader"`
+	ImageRequestMonitor             string `json:"image_request_monitor" yaml:"image_request_monitor"`
+	ImageClusterAutoscaler          string `json:"image_cluster_autoscaler" yaml:"image_cluster_autoscaler"`
+	ImageFluentBit                  string `json:"image_fluent_bit" yaml:"image_fluent_bit"`
+	ImageIstioProxy                 string `json:"image_istio_proxy" yaml:"image_istio_proxy"`
+	ImageIstioPilot                 string `json:"image_istio_pilot" yaml:"image_istio_pilot"`
+	ImageGooglePause                string `json:"image_google_pause" yaml:"image_google_pause"`
+	ImagePrometheus                 string `json:"image_prometheus" yaml:"image_prometheus"`
+	ImagePrometheusConfigReloader   string `json:"image_prometheus_config_reloader" yaml:"image_prometheus_config_reloader"`
+	ImagePrometheusOperator         string `json:"image_prometheus_operator" yaml:"image_prometheus_operator"`
+	ImagePrometheusStatsDExporter   string `json:"image_prometheus_statsd_exporter" yaml:"image_prometheus_statsd_exporter"`
+	ImagePrometheusDCGMExporter     string `json:"image_prometheus_dcgm_exporter" yaml:"image_prometheus_dcgm_exporter"`
+	ImagePrometheusKubeStateMetrics string `json:"image_prometheus_kube_state_metrics" yaml:"image_prometheus_kube_state_metrics"`
+	ImagePrometheusNodeExporter     string `json:"image_prometheus_node_exporter" yaml:"image_prometheus_node_exporter"`
+	ImageKubeRBACProxy              string `json:"image_kube_rbac_proxy" yaml:"image_kube_rbac_proxy"`
+	ImageGrafana                    string `json:"image_grafana" yaml:"image_grafana"`
+	ImageEventExporter              string `json:"image_event_exporter" yaml:"image_event_exporter"`
 }
 
 type GCPManagedConfig struct {
@@ -255,6 +257,20 @@ var GCPCoreConfigStructFieldValidations = []*cr.StructFieldValidation{
 			Validator: validateImageVersion,
 		},
 	},
+	{
+		StructField: "ImagePrometheusDCGMExporter",
+		StringValidation: &cr.StringValidation{
+			Default:   "quay.io/cortexlabs/prometheus-dcgm-exporter:" + consts.CortexVersion,
+			Validator: validateImageVersion,
+		},
+	},
+	{
+		StructField: "ImagePrometheusKubeStateMetrics",
+		StringValidation: &cr.StringValidation{
+			Default:   "quay.io/cortexlabs/prometheus-kube-state-metrics:" + consts.CortexVersion,
+			Validator: validateImageVersion,
+		},
+	},
 	{
 		StructField: "ImageGrafana",
 		StringValidation: &cr.StringValidation{
@@ -697,6 +713,8 @@ func (cc *GCPCoreConfig) UserTable() table.KeyValuePairs {
 	items.Add(ImagePrometheusConfigReloaderUserKey, cc.ImagePrometheusConfigReloader)
 	items.Add(ImagePrometheusOperatorUserKey, cc.ImagePrometheusOperator)
 	items.Add(ImagePrometheusStatsDExporterUserKey, cc.ImagePrometheusStatsDExporter)
+	items.Add(ImagePrometheusDCGMExporterUserKey, cc.ImagePrometheusDCGMExporter)
+	items.Add(ImagePrometheusKubeStateMetricsUserKey, cc.ImagePrometheusKubeStateMetrics)
 	items.Add(ImagePrometheusNodeExporterUserKey, cc.ImagePrometheusNodeExporter)
 	items.Add(ImageKubeRBACProxyUserKey, cc.ImageKubeRBACProxy)
 	items.Add(ImageGrafanaUserKey, cc.ImageGrafana)
@@ -805,6 +823,12 @@ func (cc *GCPCoreConfig) TelemetryEvent() map[string]interface{} {
 	if strings.HasPrefix(cc.ImagePrometheusStatsDExporter, "cortexlabs/") {
 		event["image_prometheus_statsd_exporter._is_custom"] = true
 	}
+	if strings.HasPrefix(cc.ImagePrometheusDCGMExporter, "cortexlabs/") { // NOTE(review): flags images that start with "cortexlabs/"; the default starts with "quay.io/cortexlabs/" - confirm the intended _is_custom check
+		event["image_prometheus_dcgm_exporter._is_custom"] = true
+	}
+	if strings.HasPrefix(cc.ImagePrometheusKubeStateMetrics, "cortexlabs/") { // NOTE(review): same prefix test as the dcgm exporter check above - confirm
+		event["image_prometheus_kube_state_metrics._is_custom"] = true
+	}
 	if strings.HasPrefix(cc.ImagePrometheusNodeExporter, "cortexlabs/") {
 		event["image_prometheus_node_exporter._is_custom"] = true
 	}
diff --git a/pkg/types/clusterconfig/config_key.go b/pkg/types/clusterconfig/config_key.go
index 6ca580b9b2..920b558b50 100644
--- a/pkg/types/clusterconfig/config_key.go
+++ b/pkg/types/clusterconfig/config_key.go
@@ -66,7 +66,6 @@ const (
 	ImageNeuronRTDKey                      = "image_neuron_rtd"
 	ImageNvidiaKey                         = "image_nvidia"
 	ImageFluentBitKey                      = "image_fluent_bit"
-	ImageStatsdKey                         = "image_statsd"
 	ImageIstioProxyKey                     = "image_istio_proxy"
 	ImageIstioPilotKey                     = "image_istio_pilot"
 	ImageGooglePauseKey                    = "image_google_pause"
@@ -74,6 +73,8 @@ const (
 	ImagePrometheusConfigReloaderKey       = "image_prometheus_config_reloader"
 	ImagePrometheusOperatorKey             = "image_prometheus_operator"
 	ImagePrometheusStatsDExporterKey       = "image_prometheus_statsd_exporter"
+	ImagePrometheusDCGMExporterKey         = "image_prometheus_dcgm_exporter"
+	ImagePrometheusKubeStateMetricsKey     = "image_prometheus_kube_state_metrics"
 	ImagePrometheusNodeExporterKey         = "image_prometheus_node_exporter"
 	ImageKubeRBACProxyKey                  = "image_kube_rbac_proxy"
 	ImageGrafanaKey                        = "image_grafana"
@@ -136,6 +137,8 @@ const (
 	ImagePrometheusConfigReloaderUserKey       = "prometheus config reloader image"
 	ImagePrometheusOperatorUserKey             = "prometheus operator image"
 	ImagePrometheusStatsDExporterUserKey       = "prometheus statsd exporter image"
+	ImagePrometheusDCGMExporterUserKey         = "prometheus dcgm exporter image"
+	ImagePrometheusKubeStateMetricsUserKey     = "prometheus kube-state-metrics image"
 	ImagePrometheusNodeExporterUserKey         = "prometheus node exporter image"
 	ImageKubeRBACProxyUserKey                  = "kube rbac proxy image"
 	ImageGrafanaUserKey                        = "grafana image"