From 421b7dbe0499c27917c1d73cf19bff8996629022 Mon Sep 17 00:00:00 2001
From: Miguel Varela Ramos <miguel@cortexlabs.com>
Date: Tue, 2 Feb 2021 11:31:21 +0100
Subject: [PATCH 01/26] WIP: grafana deployment

---
 manager/manifests/grafana.yaml | 87 ++++++++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)
 create mode 100644 manager/manifests/grafana.yaml

diff --git a/manager/manifests/grafana.yaml b/manager/manifests/grafana.yaml
new file mode 100644
index 0000000000..884d7ffdf3
--- /dev/null
+++ b/manager/manifests/grafana.yaml
@@ -0,0 +1,87 @@
+# Copyright 2021 Cortex Labs, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  labels:
+    app: grafana
+  name: grafana
+  namespace: default
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: grafana
+  template:
+    metadata:
+      labels:
+        app: grafana
+    spec:
+      containers:
+        - env: []
+          image: grafana/grafana:7.3.7
+          name: grafana
+          ports:
+            - containerPort: 3000
+              name: http
+          readinessProbe:
+            httpGet:
+              path: /api/health
+              port: http
+          resources:
+            limits:
+              cpu: 200m
+              memory: 200Mi
+            requests:
+              cpu: 100m
+              memory: 100Mi
+          volumeMounts:
+            - mountPath: /var/lib/grafana
+              name: grafana-storage
+              readOnly: false
+            - mountPath: /etc/grafana/provisioning/datasources
+              name: grafana-datasources
+              readOnly: false
+      nodeSelector:
+        beta.kubernetes.io/os: linux
+      securityContext:
+        fsGroup: 65534
+        runAsNonRoot: true
+        runAsUser: 65534
+      serviceAccountName: grafana
+      volumes:
+        - emptyDir: {}
+          name: grafana-storage
+#        - name: grafana-datasources
+#          secret:
+#            secretName: grafana-datasources
+
+---
+
+apiVersion: v1
+kind: Service
+metadata:
+  labels:
+    app: grafana
+  name: grafana
+  namespace: default
+spec:
+  ports:
+    - name: http
+      port: 3000
+      targetPort: http
+  selector:
+    app: grafana
+  type: NodePort

From 29059f7d6b823be67b0dc9a6f042a54b5a1e3ba9 Mon Sep 17 00:00:00 2001
From: Miguel Varela Ramos <miguel@cortexlabs.com>
Date: Fri, 5 Feb 2021 15:39:42 +0100
Subject: [PATCH 02/26] Add manifests for Grafana stateful set with persistent
 volume & cortex dashboard

---
 manager/manifests/grafana.yaml                |  87 --
 .../manifests/grafana/grafana-dashboard.yaml  | 919 ++++++++++++++++++
 manager/manifests/grafana/grafana.yaml        | 162 +++
 3 files changed, 1081 insertions(+), 87 deletions(-)
 delete mode 100644 manager/manifests/grafana.yaml
 create mode 100644 manager/manifests/grafana/grafana-dashboard.yaml
 create mode 100644 manager/manifests/grafana/grafana.yaml

diff --git a/manager/manifests/grafana.yaml b/manager/manifests/grafana.yaml
deleted file mode 100644
index 884d7ffdf3..0000000000
--- a/manager/manifests/grafana.yaml
+++ /dev/null
@@ -1,87 +0,0 @@
-# Copyright 2021 Cortex Labs, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  labels:
-    app: grafana
-  name: grafana
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: grafana
-  template:
-    metadata:
-      labels:
-        app: grafana
-    spec:
-      containers:
-        - env: []
-          image: grafana/grafana:7.3.7
-          name: grafana
-          ports:
-            - containerPort: 3000
-              name: http
-          readinessProbe:
-            httpGet:
-              path: /api/health
-              port: http
-          resources:
-            limits:
-              cpu: 200m
-              memory: 200Mi
-            requests:
-              cpu: 100m
-              memory: 100Mi
-          volumeMounts:
-            - mountPath: /var/lib/grafana
-              name: grafana-storage
-              readOnly: false
-            - mountPath: /etc/grafana/provisioning/datasources
-              name: grafana-datasources
-              readOnly: false
-      nodeSelector:
-        beta.kubernetes.io/os: linux
-      securityContext:
-        fsGroup: 65534
-        runAsNonRoot: true
-        runAsUser: 65534
-      serviceAccountName: grafana
-      volumes:
-        - emptyDir: {}
-          name: grafana-storage
-#        - name: grafana-datasources
-#          secret:
-#            secretName: grafana-datasources
-
----
-
-apiVersion: v1
-kind: Service
-metadata:
-  labels:
-    app: grafana
-  name: grafana
-  namespace: default
-spec:
-  ports:
-    - name: http
-      port: 3000
-      targetPort: http
-  selector:
-    app: grafana
-  type: NodePort
diff --git a/manager/manifests/grafana/grafana-dashboard.yaml b/manager/manifests/grafana/grafana-dashboard.yaml
new file mode 100644
index 0000000000..ae56e9115c
--- /dev/null
+++ b/manager/manifests/grafana/grafana-dashboard.yaml
@@ -0,0 +1,919 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboard-cortex
+  namespace: default
+data:
+  cortex.json: |-
+    {
+      "annotations": {
+        "list": [
+          {
+            "builtIn": 1,
+            "datasource": "prometheus",
+            "enable": true,
+            "hide": true,
+            "iconColor": "rgba(0, 211, 255, 1)",
+            "name": "Annotations & Alerts",
+            "type": "dashboard"
+          }
+        ]
+      },
+      "editable": true,
+      "gnetId": null,
+      "graphTooltip": 0,
+      "id": 2,
+      "links": [],
+      "panels": [
+        {
+          "datasource": "prometheus",
+          "fieldConfig": {
+            "defaults": {
+              "custom": {}
+            },
+            "overrides": []
+          },
+          "gridPos": {
+            "h": 3,
+            "w": 24,
+            "x": 0,
+            "y": 0
+          },
+          "id": 12,
+          "options": {
+            "content": "<h1 style=\"text-align: center\">Cortex Dashboard</h1>",
+            "mode": "html"
+          },
+          "pluginVersion": "7.4.0",
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "Cortex Dashboard",
+          "transparent": true,
+          "type": "text"
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": "prometheus",
+          "description": "Rate of all responses by API, for all API IDs",
+          "fieldConfig": {
+            "defaults": {
+              "custom": {},
+              "unit": "reqps"
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 9,
+            "w": 12,
+            "x": 0,
+            "y": 3
+          },
+          "hiddenSeries": false,
+          "id": 5,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "nullPointMode": "null",
+          "options": {
+            "alertThreshold": true
+          },
+          "percentage": false,
+          "pluginVersion": "7.4.0",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum(rate(istio_requests_total{destination_service_name=~\"api-.*\"}[5m])) by (destination_service_name)",
+              "interval": "",
+              "legendFormat": "{{destination_service_name}}",
+              "refId": "A"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "Request Rate",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "transformations": [
+            {
+              "id": "renameByRegex",
+              "options": {
+                "regex": "([^\\.]+)\\..+",
+                "renamePattern": "$1"
+              }
+            }
+          ],
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "reqps",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": "prometheus",
+          "description": "Rate of 2XX status codes returned by API, for all API IDs",
+          "fieldConfig": {
+            "defaults": {
+              "custom": {},
+              "unit": "reqps"
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 9,
+            "w": 12,
+            "x": 12,
+            "y": 3
+          },
+          "hiddenSeries": false,
+          "id": 2,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "nullPointMode": "null",
+          "options": {
+            "alertThreshold": true
+          },
+          "percentage": false,
+          "pluginVersion": "7.4.0",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum(rate(istio_requests_total{destination_service_name=~\"api-.*\", response_code=~\"2.*\"}[5m])) by (destination_service_name, response_code)",
+              "interval": "",
+              "legendFormat": "{{destination_service_name}}",
+              "refId": "A"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "2XX Responses",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "transformations": [
+            {
+              "id": "renameByRegex",
+              "options": {
+                "regex": "([^\\.]+)\\..+",
+                "renamePattern": "$1"
+              }
+            }
+          ],
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "reqps",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": "prometheus",
+          "description": "Rate of 4XX status codes returned by API, for all API IDs",
+          "fieldConfig": {
+            "defaults": {
+              "custom": {},
+              "unit": "reqps"
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 9,
+            "w": 12,
+            "x": 0,
+            "y": 12
+          },
+          "hiddenSeries": false,
+          "id": 3,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "nullPointMode": "null",
+          "options": {
+            "alertThreshold": true
+          },
+          "percentage": false,
+          "pluginVersion": "7.4.0",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum(rate(istio_requests_total{destination_service_name=~\"api-.*\", response_code=~\"4.*\"}[5m])) by (destination_service_name, response_code)",
+              "interval": "",
+              "legendFormat": "{{destination_service_name}}",
+              "refId": "A"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "4XX Responses",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "transformations": [
+            {
+              "id": "renameByRegex",
+              "options": {
+                "regex": "([^\\.]+)\\..+",
+                "renamePattern": "$1"
+              }
+            }
+          ],
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "reqps",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": "prometheus",
+          "description": "Rate of 5XX status codes returned by API, for all API IDs",
+          "fieldConfig": {
+            "defaults": {
+              "custom": {},
+              "unit": "reqps"
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 9,
+            "w": 12,
+            "x": 12,
+            "y": 12
+          },
+          "hiddenSeries": false,
+          "id": 4,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "nullPointMode": "null",
+          "options": {
+            "alertThreshold": true
+          },
+          "percentage": false,
+          "pluginVersion": "7.4.0",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum(rate(istio_requests_total{destination_service_name=~\"api-.*\", response_code=~\"5.*\"}[5m])) by (destination_service_name, response_code)",
+              "interval": "",
+              "legendFormat": "{{destination_service_name}}",
+              "refId": "A"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "5XX Responses",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "transformations": [
+            {
+              "id": "renameByRegex",
+              "options": {
+                "regex": "([^\\.]+)\\..+",
+                "renamePattern": "$1"
+              }
+            }
+          ],
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "reqps",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": "prometheus",
+          "description": "99th percentile for request latency per API, for all API IDs",
+          "fieldConfig": {
+            "defaults": {
+              "custom": {},
+              "unit": "ms"
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 0,
+            "y": 21
+          },
+          "hiddenSeries": false,
+          "id": 7,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "nullPointMode": "null",
+          "options": {
+            "alertThreshold": true
+          },
+          "percentage": false,
+          "pluginVersion": "7.4.0",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "histogram_quantile(0.99, sum without (response_code) (rate(istio_request_duration_milliseconds_bucket{destination_service_name=~\"api-.*\"}[5m])))",
+              "interval": "",
+              "legendFormat": "{{destination_service_name}}",
+              "refId": "A"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "p99 Latency",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "transformations": [
+            {
+              "id": "renameByRegex",
+              "options": {
+                "regex": "([^\\.]+)\\..+",
+                "renamePattern": "$1"
+              }
+            }
+          ],
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "ms",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": "prometheus",
+          "description": "90th percentile for request latency per API, for all API IDs",
+          "fieldConfig": {
+            "defaults": {
+              "custom": {},
+              "unit": "ms"
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 21
+          },
+          "hiddenSeries": false,
+          "id": 9,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "nullPointMode": "null",
+          "options": {
+            "alertThreshold": true
+          },
+          "percentage": false,
+          "pluginVersion": "7.4.0",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "histogram_quantile(0.9, sum without (response_code) (rate(istio_request_duration_milliseconds_bucket{destination_service_name=~\"api-.*\"}[5m])))",
+              "interval": "",
+              "legendFormat": "{{destination_service_name}}",
+              "refId": "A"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "p90 Latency",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "transformations": [
+            {
+              "id": "renameByRegex",
+              "options": {
+                "regex": "([^\\.]+)\\..+",
+                "renamePattern": "$1"
+              }
+            }
+          ],
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "ms",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": "prometheus",
+          "description": "50th percentile for request latency per API, for all API IDs",
+          "fieldConfig": {
+            "defaults": {
+              "custom": {},
+              "unit": "ms"
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 0,
+            "y": 29
+          },
+          "hiddenSeries": false,
+          "id": 8,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "nullPointMode": "null",
+          "options": {
+            "alertThreshold": true
+          },
+          "percentage": false,
+          "pluginVersion": "7.4.0",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "histogram_quantile(0.5, sum without (response_code) (rate(istio_request_duration_milliseconds_bucket{destination_service_name=~\"api-.*\"}[5m])))",
+              "interval": "",
+              "legendFormat": "{{destination_service_name}}",
+              "refId": "A"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "p50 Latency",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "transformations": [
+            {
+              "id": "renameByRegex",
+              "options": {
+                "regex": "([^\\.]+)\\..+",
+                "renamePattern": "$1"
+              }
+            }
+          ],
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "ms",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": "prometheus",
+          "description": "Average request latency per API, for all API IDs",
+          "fieldConfig": {
+            "defaults": {
+              "custom": {},
+              "unit": "ms"
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 29
+          },
+          "hiddenSeries": false,
+          "id": 10,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "nullPointMode": "null",
+          "options": {
+            "alertThreshold": true
+          },
+          "percentage": false,
+          "pluginVersion": "7.4.0",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "  sum without (response_code) (rate(istio_request_duration_milliseconds_sum{destination_service_name=~\"api-.*\"}[5m]))\n/\n  sum without (response_code) (rate(istio_request_duration_milliseconds_count{destination_service_name=~\"api-.*\"}[5m]))",
+              "interval": "",
+              "legendFormat": "{{destination_service_name}}",
+              "refId": "A"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "Average Latency",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "transformations": [
+            {
+              "id": "renameByRegex",
+              "options": {
+                "regex": "([^\\.]+)\\..+",
+                "renamePattern": "$1"
+              }
+            }
+          ],
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "ms",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        }
+      ],
+      "refresh": "30s",
+      "schemaVersion": 27,
+      "style": "dark",
+      "tags": [],
+      "templating": {
+        "list": []
+      },
+      "time": {
+        "from": "now-1h",
+        "to": "now"
+      },
+      "timepicker": {},
+      "timezone": "",
+      "title": "Cortex",
+      "uid": "ZQv3auYGk",
+      "version": 1
+    }
diff --git a/manager/manifests/grafana/grafana.yaml b/manager/manifests/grafana/grafana.yaml
new file mode 100644
index 0000000000..18904f258a
--- /dev/null
+++ b/manager/manifests/grafana/grafana.yaml
@@ -0,0 +1,162 @@
+# Copyright 2021 Cortex Labs, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-datasources
+  namespace: default
+data:
+  datasources.yaml: |
+    {
+        "apiVersion": 1,
+        "datasources": [
+            {
+                "access": "proxy",
+                "editable": false,
+                "name": "prometheus",
+                "orgId": 1,
+                "type": "prometheus",
+                "url": "http://prometheus.default:9090",
+                "version": 1
+            }
+        ]
+    }
+
+---
+
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboards
+  namespace: default
+data:
+  dashboards.yaml: |-
+    {
+        "apiVersion": 1,
+        "providers": [
+            {
+                "folder": "Default",
+                "name": "0",
+                "options": {
+                    "path": "/grafana-dashboard-definitions/0"
+                },
+                "disableDeletion": true,
+                "orgId": 1,
+                "type": "file"
+            }
+        ]
+    }
+
+---
+
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: grafana-storage
+  namespace: default
+spec:
+  storageClassName: ssd
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 2Gi
+
+---
+
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  labels:
+    app: grafana
+  name: grafana
+  namespace: default
+spec:
+  serviceName: grafana
+  replicas: 1
+  selector:
+    matchLabels:
+      app: grafana
+  template:
+    metadata:
+      labels:
+        app: grafana
+    spec:
+      containers:
+        - env: []
+          image: grafana/grafana:7.4.0
+          name: grafana
+          ports:
+            - containerPort: 3000
+              name: http
+          readinessProbe:
+            httpGet:
+              path: /api/health
+              port: http
+          resources:
+            limits:
+              cpu: 200m
+              memory: 200Mi
+            requests:
+              cpu: 100m
+              memory: 100Mi
+          volumeMounts:
+            - mountPath: /var/lib/grafana
+              name: grafana-storage
+              readOnly: false
+            - mountPath: /etc/grafana/provisioning/datasources
+              name: grafana-datasources
+              readOnly: false
+            - mountPath: /etc/grafana/provisioning/dashboards
+              name: grafana-dashboards
+              readOnly: false
+            - mountPath: /grafana-dashboard-definitions/0/cortex
+              name: grafana-dashboard-cortex
+              readOnly: false
+      securityContext:
+        fsGroup: 65534
+        runAsNonRoot: true
+        runAsUser: 65534
+      volumes:
+        - name: grafana-storage
+          persistentVolumeClaim:
+            claimName: grafana-storage
+        - name: grafana-datasources
+          configMap:
+            name: grafana-datasources
+        - name: grafana-dashboards
+          configMap:
+            name: grafana-dashboards
+        - name: grafana-dashboard-cortex
+          configMap:
+            name: grafana-dashboard-cortex
+
+---
+
+apiVersion: v1
+kind: Service
+metadata:
+  labels:
+    app: grafana
+  name: grafana
+  namespace: default
+spec:
+  ports:
+    - name: http
+      port: 3000
+      targetPort: http
+  selector:
+    app: grafana
+  type: NodePort

From ae1dff7c2f86d6317df379e2af2d4b8415b501f4 Mon Sep 17 00:00:00 2001
From: Miguel Varela Ramos <miguel@cortexlabs.com>
Date: Fri, 5 Feb 2021 16:41:33 +0100
Subject: [PATCH 03/26] Add virtual service for grafana

---
 manager/manifests/grafana/grafana.yaml | 35 ++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

diff --git a/manager/manifests/grafana/grafana.yaml b/manager/manifests/grafana/grafana.yaml
index 18904f258a..bf2843dd3f 100644
--- a/manager/manifests/grafana/grafana.yaml
+++ b/manager/manifests/grafana/grafana.yaml
@@ -95,8 +95,7 @@ spec:
         app: grafana
     spec:
       containers:
-        - env: []
-          image: grafana/grafana:7.4.0
+        - image: grafana/grafana:7.4.0
           name: grafana
           ports:
             - containerPort: 3000
@@ -112,6 +111,11 @@ spec:
             requests:
               cpu: 100m
               memory: 100Mi
+          env:
+            - name: GF_SERVER_ROOT_URL
+              value: "%(protocol)s://%(domain)s:%(http_port)s/dashboard"
+            - name: GF_SERVER_SERVE_FROM_SUB_PATH
+              value: "true"
           volumeMounts:
             - mountPath: /var/lib/grafana
               name: grafana-storage
@@ -160,3 +164,30 @@ spec:
   selector:
     app: grafana
   type: NodePort
+
+---
+
+apiVersion: networking.istio.io/v1beta1
+kind: VirtualService
+metadata:
+  name: grafana
+  namespace: default
+spec:
+  hosts:
+    - "*"
+  gateways:
+    - operator-gateway
+  http:
+    - name: grafana
+      match:
+        - uri:
+            prefix: "/dashboard"
+        - uri:
+            prefix: "/grafana"
+      rewrite:
+        uri: "/dashboard"
+      route:
+        - destination:
+            host: grafana
+            port:
+              number: 3000

From 810fc9e5ff75bf2641a5ae5626179b9c45fdb839 Mon Sep 17 00:00:00 2001
From: Miguel Varela Ramos <miguel@cortexlabs.com>
Date: Fri, 5 Feb 2021 16:42:05 +0100
Subject: [PATCH 04/26] Add missing license header to grafana-dashboard.yaml

---
 manager/manifests/grafana/grafana-dashboard.yaml | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/manager/manifests/grafana/grafana-dashboard.yaml b/manager/manifests/grafana/grafana-dashboard.yaml
index ae56e9115c..eb8a09b308 100644
--- a/manager/manifests/grafana/grafana-dashboard.yaml
+++ b/manager/manifests/grafana/grafana-dashboard.yaml
@@ -1,3 +1,17 @@
+# Copyright 2021 Cortex Labs, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 apiVersion: v1
 kind: ConfigMap
 metadata:

From 1154f7c4ed5d1b4bc36524334954d4239494282d Mon Sep 17 00:00:00 2001
From: Miguel Varela Ramos <miguel@cortexlabs.com>
Date: Sat, 6 Feb 2021 17:48:11 +0100
Subject: [PATCH 05/26] Remove metrics exporters and add grafana installation

---
 CONTRIBUTING.md                               |  4 +-
 build/images.sh                               |  3 +-
 cli/cmd/lib_cluster_config_aws.go             |  8 +--
 docs/clusters/aws/install.md                  |  2 +-
 docs/clusters/gcp/install.md                  |  2 +-
 images/grafana/Dockerfile                     |  1 +
 .../prometheus-stackdriver-sidecar/Dockerfile |  1 -
 images/prometheus-to-cloudwatch/Dockerfile    |  1 -
 manager/install.sh                            |  3 +-
 .../manifests/prometheus-monitoring.yaml.j2   | 18 ------
 .../prometheus-to-cloudwatch.yaml.j2          | 60 -------------------
 pkg/types/clusterconfig/cluster_config_aws.go | 26 ++------
 pkg/types/clusterconfig/cluster_config_gcp.go | 36 +++++------
 pkg/types/clusterconfig/config_key.go         |  6 +-
 14 files changed, 38 insertions(+), 133 deletions(-)
 create mode 100644 images/grafana/Dockerfile
 delete mode 100644 images/prometheus-stackdriver-sidecar/Dockerfile
 delete mode 100644 images/prometheus-to-cloudwatch/Dockerfile
 delete mode 100644 manager/manifests/prometheus-to-cloudwatch.yaml.j2

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 939dbf0a79..37f09067d8 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -193,7 +193,7 @@ image_prometheus: <account_id>.dkr.ecr.<region>.amazonaws.com/cortexlabs/prometh
 image_prometheus_config_reloader: <account_id>.dkr.ecr.<region>.amazonaws.com/cortexlabs/prometheus-config-reloader:latest
 image_prometheus_operator: <account_id>.dkr.ecr.<region>.amazonaws.com/cortexlabs/prometheus-operator:latest
 image_prometheus_statsd_exporter: <account_id>.dkr.ecr.<region>.amazonaws.com/cortexlabs/prometheus-statsd-exporter:latest
-image_prometheus_to_cloudwatch: <account_id>.dkr.ecr.<region>.amazonaws.com/cortexlabs/prometheus-to-cloudwatch:latest
+image_grafana: <account_id>.dkr.ecr.<region>.amazonaws.com/cortexlabs/grafana:latest
 ```
 
 Create `dev/config/cluster-gcp.yaml`. Paste the following config, and update `project`, `zone`, and all registry URLs (replace `<project_id>` with your project ID, and update `gcr.io` if you are using a different host):
@@ -220,7 +220,7 @@ image_prometheus: gcr.io/<project_id>/cortexlabs/prometheus:latest
 image_prometheus_config_reloader: gcr.io/<project_id>/cortexlabs/prometheus-config-reloader:latest
 image_prometheus_operator: gcr.io/<project_id>/cortexlabs/prometheus-operator:latest
 image_prometheus_statsd_exporter: gcr.io/<project_id>/cortexlabs/prometheus-statsd-exporter:latest
-image_prometheus_stackdriver_sidecar: gcr.io/<project_id>/cortexlabs/prometheus-stackdriver-sidecar:latest
+image_grafana: gcr.io/<project_id>/cortexlabs/grafana:latest
 ```
 
 ### Building
diff --git a/build/images.sh b/build/images.sh
index 22baf3393e..9515f73b84 100644
--- a/build/images.sh
+++ b/build/images.sh
@@ -73,6 +73,7 @@ non_dev_images_cluster=(
   "prometheus-config-reloader"
   "prometheus-operator"
   "prometheus-statsd-exporter"
+  "grafana"
 )
 non_dev_images_aws=(
   # includes non_dev_images_cluster
@@ -81,12 +82,10 @@ non_dev_images_aws=(
   "inferentia"
   "neuron-rtd"
   "nvidia"
-  "prometheus-to-cloudwatch"
 )
 non_dev_images_gcp=(
   # includes non_dev_images_cluster
   "google-pause"
-  "prometheus-stackdriver-sidecar"
 )
 
 all_images=(
diff --git a/cli/cmd/lib_cluster_config_aws.go b/cli/cmd/lib_cluster_config_aws.go
index c00c409857..75214073ef 100644
--- a/cli/cmd/lib_cluster_config_aws.go
+++ b/cli/cmd/lib_cluster_config_aws.go
@@ -414,8 +414,8 @@ func setConfigFieldsFromCached(userClusterConfig *clusterconfig.Config, cachedCl
 		return clusterconfig.ErrorConfigCannotBeChangedOnUpdate(clusterconfig.ImagePrometheusStatsDExporterKey, cachedClusterConfig.ImagePrometheusStatsDExporter)
 	}
 
-	if s.Obj(cachedClusterConfig.ImagePrometheusToCloudWatch) != s.Obj(userClusterConfig.ImagePrometheusToCloudWatch) {
-		return clusterconfig.ErrorConfigCannotBeChangedOnUpdate(clusterconfig.ImagePrometheusToCloudwatchKey, cachedClusterConfig.ImagePrometheusToCloudWatch)
+	if s.Obj(cachedClusterConfig.ImageGrafana) != s.Obj(userClusterConfig.ImageGrafana) {
+		return clusterconfig.ErrorConfigCannotBeChangedOnUpdate(clusterconfig.ImageGrafanaKey, cachedClusterConfig.ImageGrafana)
 	}
 
 	if userClusterConfig.Spot != nil && *userClusterConfig.Spot != *cachedClusterConfig.Spot {
@@ -729,8 +729,8 @@ func clusterConfigConfirmationStr(clusterConfig clusterconfig.Config, awsCreds A
 	if clusterConfig.ImagePrometheusStatsDExporter != defaultConfig.ImagePrometheusStatsDExporter {
 		items.Add(clusterconfig.ImagePrometheusStatsDExporterUserKey, clusterConfig.ImagePrometheusStatsDExporter)
 	}
-	if clusterConfig.ImagePrometheusToCloudWatch != defaultConfig.ImagePrometheusToCloudWatch {
-		items.Add(clusterconfig.ImagePrometheusToCloudwatchUserKey, clusterConfig.ImagePrometheusToCloudWatch)
+	if clusterConfig.ImageGrafana != defaultConfig.ImageGrafana {
+		items.Add(clusterconfig.ImageGrafanaUserKey, clusterConfig.ImageGrafana)
 	}
 	return items.String()
 }
diff --git a/docs/clusters/aws/install.md b/docs/clusters/aws/install.md
index 81f9c92df3..c080cbddfe 100644
--- a/docs/clusters/aws/install.md
+++ b/docs/clusters/aws/install.md
@@ -105,5 +105,5 @@ image_prometheus: quay.io/cortexlabs/prometheus:master
 image_prometheus_config_reloader: quay.io/cortexlabs/prometheus-config-reloader:master
 image_prometheus_operator: quay.io/cortexlabs/prometheus-operator:master
 image_prometheus_statsd_exporter: quay.io/cortexlabs/prometheus-statsd-exporter:master
-image_prometheus_to_cloudwatch: quay.io/cortexlabs/prometheus-to-cloudwatch:master
+image_grafana: quay.io/cortexlabs/grafana:master
 ```
diff --git a/docs/clusters/gcp/install.md b/docs/clusters/gcp/install.md
index 480513c057..966270617b 100644
--- a/docs/clusters/gcp/install.md
+++ b/docs/clusters/gcp/install.md
@@ -81,5 +81,5 @@ image_prometheus: quay.io/cortexlabs/prometheus:master
 image_prometheus_config_reloader: quay.io/cortexlabs/prometheus-config-reloader:master
 image_prometheus_operator: quay.io/cortexlabs/prometheus-operator:master
 image_prometheus_statsd_exporter: quay.io/cortexlabs/prometheus-statsd-exporter:master
-image_prometheus_stackdriver_sidecar: quay.io/cortexlabs/prometheus-stackdriver-sidecar:master
+image_grafana: quay.io/cortexlabs/grafana:master
 ```
diff --git a/images/grafana/Dockerfile b/images/grafana/Dockerfile
new file mode 100644
index 0000000000..50f9cb3064
--- /dev/null
+++ b/images/grafana/Dockerfile
@@ -0,0 +1 @@
+FROM grafana/grafana:7.4.0
diff --git a/images/prometheus-stackdriver-sidecar/Dockerfile b/images/prometheus-stackdriver-sidecar/Dockerfile
deleted file mode 100644
index d5f3a5eb17..0000000000
--- a/images/prometheus-stackdriver-sidecar/Dockerfile
+++ /dev/null
@@ -1 +0,0 @@
-FROM gcr.io/stackdriver-prometheus/stackdriver-prometheus-sidecar:0.8.1
diff --git a/images/prometheus-to-cloudwatch/Dockerfile b/images/prometheus-to-cloudwatch/Dockerfile
deleted file mode 100644
index a6a56c9cc7..0000000000
--- a/images/prometheus-to-cloudwatch/Dockerfile
+++ /dev/null
@@ -1 +0,0 @@
-FROM cloudposse/prometheus-to-cloudwatch:0.14.0
diff --git a/manager/install.sh b/manager/install.sh
index d3d38b93c2..762d1647d7 100755
--- a/manager/install.sh
+++ b/manager/install.sh
@@ -68,7 +68,7 @@ function cluster_up_aws() {
   echo -n "￮ configuring metrics "
   envsubst < manifests/metrics-server.yaml | kubectl apply -f - >/dev/null
   setup_prometheus
-  python render_template.py $CORTEX_CLUSTER_CONFIG_FILE manifests/prometheus-to-cloudwatch.yaml.j2 | kubectl apply -f - >/dev/null
+  kubectl apply -f /workspace/grafana/ >/dev/null
   echo "✓"
 
   if [[ "$CORTEX_INSTANCE_TYPE" == p* ]] || [[ "$CORTEX_INSTANCE_TYPE" == g* ]]; then
@@ -126,6 +126,7 @@ function cluster_up_gcp() {
 
   echo -n "￮ configuring metrics "
   setup_prometheus
+  kubectl apply -f /workspace/grafana/ >/dev/null
   echo "✓"
 
   if [ -n "$CORTEX_ACCELERATOR_TYPE" ]; then
diff --git a/manager/manifests/prometheus-monitoring.yaml.j2 b/manager/manifests/prometheus-monitoring.yaml.j2
index 20ce3fb25e..ef578ca26c 100644
--- a/manager/manifests/prometheus-monitoring.yaml.j2
+++ b/manager/manifests/prometheus-monitoring.yaml.j2
@@ -58,24 +58,6 @@ spec:
     fsGroup: 2000
     runAsNonRoot: true
     runAsUser: 1000
-  {% if config["provider"] == "gcp" %}
-  containers:
-    - name: stackdriver-sidecar
-      image: {{ config["image_prometheus_stackdriver_sidecar"] }}
-      imagePullPolicy: Always
-      args:
-      - --stackdriver.project-id={{ config["project"] }}
-      - --prometheus.wal-directory=/data/prometheus-db/wal
-      - --stackdriver.kubernetes.location={{ config["zone"] }}
-      - --stackdriver.kubernetes.cluster-name={{ config["cluster_name"] }}
-      - --include={job=~"default/.*",__name__=~"cortex.*"}
-      ports:
-      - name: sidecar
-        containerPort: 9091
-      volumeMounts:
-      - mountPath: /data
-        name: prometheus-prometheus-db
-  {% endif %}
 ---
 
 apiVersion: v1
diff --git a/manager/manifests/prometheus-to-cloudwatch.yaml.j2 b/manager/manifests/prometheus-to-cloudwatch.yaml.j2
deleted file mode 100644
index 7f711fb1dd..0000000000
--- a/manager/manifests/prometheus-to-cloudwatch.yaml.j2
+++ /dev/null
@@ -1,60 +0,0 @@
-# Copyright 2021 Cortex Labs, Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-{% if config['provider'] == "aws" %}
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: prometheus-cwconfig
-  namespace: default
-data:
-  CLOUDWATCH_NAMESPACE: "cortex"
-  CLOUDWATCH_REGION: "{{ config["region"] }}"
-  CLOUDWATCH_PUBLISH_TIMEOUT: "10"
-  PROMETHEUS_SCRAPE_INTERVAL: "15"
-  PROMETHEUS_SCRAPE_URL: http://prometheus.default:9090/federate?match[]={job=~"default/.*",__name__=~"cortex.*"}
-  INCLUDE_METRICS: cortex_*
-  EXCLUDE_DIMENSIONS_FOR_METRICS: cortex_*=container,endpoint,instance,job,namespace,pod,prometheus,prometheus_replica
-
----
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: prometheus-to-cloudwatch
-  namespace: default
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: prometheus-to-cloudwatch
-  template:
-    metadata:
-      labels:
-        app: prometheus-to-cloudwatch
-    spec:
-      containers:
-        - name: prometheus-to-cloudwatch
-          image: {{ config["image_prometheus_to_cloudwatch"] }}
-          imagePullPolicy: Always
-          envFrom:
-            - configMapRef:
-                name: prometheus-cwconfig
-            - secretRef:
-                name: aws-credentials
-          resources:
-            requests:
-              cpu: 100m
-              memory: 150Mi
-{% endif %}
diff --git a/pkg/types/clusterconfig/cluster_config_aws.go b/pkg/types/clusterconfig/cluster_config_aws.go
index 33395223c7..d1698df5f9 100644
--- a/pkg/types/clusterconfig/cluster_config_aws.go
+++ b/pkg/types/clusterconfig/cluster_config_aws.go
@@ -77,7 +77,7 @@ type CoreConfig struct {
 	ImagePrometheusConfigReloader string `json:"image_prometheus_config_reloader" yaml:"image_prometheus_config_reloader"`
 	ImagePrometheusOperator       string `json:"image_prometheus_operator" yaml:"image_prometheus_operator"`
 	ImagePrometheusStatsDExporter string `json:"image_prometheus_statsd_exporter" yaml:"image_prometheus_statsd_exporter"`
-	ImagePrometheusToCloudWatch   string `json:"image_prometheus_to_cloudwatch" yaml:"image_prometheus_to_cloudwatch"`
+	ImageGrafana                  string `json:"image_grafana" yaml:"image_grafana"`
 }
 
 type ManagedConfig struct {
@@ -298,20 +298,6 @@ var CoreConfigStructFieldValidations = []*cr.StructFieldValidation{
 			Validator: validateImageVersion,
 		},
 	},
-	{
-		StructField: "ImageIstioProxy",
-		StringValidation: &cr.StringValidation{
-			Default:   "quay.io/cortexlabs/istio-proxy:" + consts.CortexVersion,
-			Validator: validateImageVersion,
-		},
-	},
-	{
-		StructField: "ImageIstioPilot",
-		StringValidation: &cr.StringValidation{
-			Default:   "quay.io/cortexlabs/istio-pilot:" + consts.CortexVersion,
-			Validator: validateImageVersion,
-		},
-	},
 	{
 		StructField: "ImagePrometheus",
 		StringValidation: &cr.StringValidation{
@@ -341,9 +327,9 @@ var CoreConfigStructFieldValidations = []*cr.StructFieldValidation{
 		},
 	},
 	{
-		StructField: "ImagePrometheusToCloudWatch",
+		StructField: "ImageGrafana",
 		StringValidation: &cr.StringValidation{
-			Default:   "quay.io/cortexlabs/prometheus-to-cloudwatch:" + consts.CortexVersion,
+			Default:   "quay.io/cortexlabs/grafana:" + consts.CortexVersion,
 			Validator: validateImageVersion,
 		},
 	},
@@ -1262,7 +1248,7 @@ func (cc *CoreConfig) UserTable() table.KeyValuePairs {
 	items.Add(ImagePrometheusConfigReloaderUserKey, cc.ImagePrometheusConfigReloader)
 	items.Add(ImagePrometheusOperatorUserKey, cc.ImagePrometheusOperator)
 	items.Add(ImagePrometheusStatsDExporterUserKey, cc.ImagePrometheusStatsDExporter)
-	items.Add(ImagePrometheusToCloudwatchUserKey, cc.ImagePrometheusToCloudWatch)
+	items.Add(ImageGrafanaUserKey, cc.ImageGrafana)
 
 	return items
 }
@@ -1391,8 +1377,8 @@ func (cc *CoreConfig) TelemetryEvent() map[string]interface{} {
 	if strings.HasPrefix(cc.ImagePrometheusStatsDExporter, "cortexlabs/") {
 		event["image_prometheus_statsd_exporter._is_custom"] = true
 	}
-	if strings.HasPrefix(cc.ImagePrometheusToCloudWatch, "cortexlabs/") {
-		event["image_prometheus_to_cloudwatch._is_custom"] = true
+	if strings.HasPrefix(cc.ImageGrafana, "cortexlabs/") {
+		event["image_grafana._is_custom"] = true
 	}
 
 	return event
diff --git a/pkg/types/clusterconfig/cluster_config_gcp.go b/pkg/types/clusterconfig/cluster_config_gcp.go
index 6348d7260e..565910fe96 100644
--- a/pkg/types/clusterconfig/cluster_config_gcp.go
+++ b/pkg/types/clusterconfig/cluster_config_gcp.go
@@ -44,19 +44,19 @@ type GCPCoreConfig struct {
 	IsManaged      bool               `json:"is_managed" yaml:"is_managed"`
 	Bucket         string             `json:"bucket" yaml:"bucket"`
 
-	ImageOperator                     string `json:"image_operator" yaml:"image_operator"`
-	ImageManager                      string `json:"image_manager" yaml:"image_manager"`
-	ImageDownloader                   string `json:"image_downloader" yaml:"image_downloader"`
-	ImageClusterAutoscaler            string `json:"image_cluster_autoscaler" yaml:"image_cluster_autoscaler"`
-	ImageFluentBit                    string `json:"image_fluent_bit" yaml:"image_fluent_bit"`
-	ImageIstioProxy                   string `json:"image_istio_proxy" yaml:"image_istio_proxy"`
-	ImageIstioPilot                   string `json:"image_istio_pilot" yaml:"image_istio_pilot"`
-	ImageGooglePause                  string `json:"image_google_pause" yaml:"image_google_pause"`
-	ImagePrometheus                   string `json:"image_prometheus" yaml:"image_prometheus"`
-	ImagePrometheusConfigReloader     string `json:"image_prometheus_config_reloader" yaml:"image_prometheus_config_reloader"`
-	ImagePrometheusOperator           string `json:"image_prometheus_operator" yaml:"image_prometheus_operator"`
-	ImagePrometheusStatsDExporter     string `json:"image_prometheus_statsd_exporter" yaml:"image_prometheus_statsd_exporter"`
-	ImagePrometheusStackdriverSidecar string `json:"image_prometheus_stackdriver_sidecar" yaml:"image_prometheus_stackdriver_sidecar"`
+	ImageOperator                 string `json:"image_operator" yaml:"image_operator"`
+	ImageManager                  string `json:"image_manager" yaml:"image_manager"`
+	ImageDownloader               string `json:"image_downloader" yaml:"image_downloader"`
+	ImageClusterAutoscaler        string `json:"image_cluster_autoscaler" yaml:"image_cluster_autoscaler"`
+	ImageFluentBit                string `json:"image_fluent_bit" yaml:"image_fluent_bit"`
+	ImageIstioProxy               string `json:"image_istio_proxy" yaml:"image_istio_proxy"`
+	ImageIstioPilot               string `json:"image_istio_pilot" yaml:"image_istio_pilot"`
+	ImageGooglePause              string `json:"image_google_pause" yaml:"image_google_pause"`
+	ImagePrometheus               string `json:"image_prometheus" yaml:"image_prometheus"`
+	ImagePrometheusConfigReloader string `json:"image_prometheus_config_reloader" yaml:"image_prometheus_config_reloader"`
+	ImagePrometheusOperator       string `json:"image_prometheus_operator" yaml:"image_prometheus_operator"`
+	ImagePrometheusStatsDExporter string `json:"image_prometheus_statsd_exporter" yaml:"image_prometheus_statsd_exporter"`
+	ImageGrafana                  string `json:"image_grafana" yaml:"image_grafana"`
 }
 
 type GCPManagedConfig struct {
@@ -231,9 +231,9 @@ var GCPCoreConfigStructFieldValidations = []*cr.StructFieldValidation{
 		},
 	},
 	{
-		StructField: "ImagePrometheusStackdriverSidecar",
+		StructField: "ImageGrafana",
 		StringValidation: &cr.StringValidation{
-			Default:   "quay.io/cortexlabs/prometheus-stackdriver-sidecar:" + consts.CortexVersion,
+			Default:   "quay.io/cortexlabs/grafana:" + consts.CortexVersion,
 			Validator: validateImageVersion,
 		},
 	},
@@ -664,7 +664,7 @@ func (cc *GCPCoreConfig) UserTable() table.KeyValuePairs {
 	items.Add(ImagePrometheusConfigReloaderUserKey, cc.ImagePrometheusConfigReloader)
 	items.Add(ImagePrometheusOperatorUserKey, cc.ImagePrometheusOperator)
 	items.Add(ImagePrometheusStatsDExporterUserKey, cc.ImagePrometheusStatsDExporter)
-	items.Add(ImagePrometheusStackdriverSidecarUserKey, cc.ImagePrometheusStackdriverSidecar)
+	items.Add(ImageGrafanaUserKey, cc.ImageGrafana)
 
 	return items
 }
@@ -766,8 +766,8 @@ func (cc *GCPCoreConfig) TelemetryEvent() map[string]interface{} {
 	if strings.HasPrefix(cc.ImagePrometheusStatsDExporter, "cortexlabs/") {
 		event["image_prometheus_statsd_exporter._is_custom"] = true
 	}
-	if strings.HasPrefix(cc.ImagePrometheusStackdriverSidecar, "cortexlabs/") {
-		event["image_prometheus_stackdriver_sidecar._is_custom"] = true
+	if strings.HasPrefix(cc.ImageGrafana, "cortexlabs/") {
+		event["image_grafana._is_custom"] = true
 	}
 	return event
 }
diff --git a/pkg/types/clusterconfig/config_key.go b/pkg/types/clusterconfig/config_key.go
index 0e10e3c680..7bd06a1ecd 100644
--- a/pkg/types/clusterconfig/config_key.go
+++ b/pkg/types/clusterconfig/config_key.go
@@ -72,8 +72,7 @@ const (
 	ImagePrometheusConfigReloaderKey       = "image_prometheus_config_reloader"
 	ImagePrometheusOperatorKey             = "image_prometheus_operator"
 	ImagePrometheusStatsDExporterKey       = "image_prometheus_statsd_exporter"
-	ImagePrometheusToCloudwatchKey         = "image_prometheus_to_cloudwatch"
-	ImagePrometheusStackdriverSidecarKey   = "image_prometheus_stackdriver_sidecar"
+	ImageGrafanaKey                        = "image_grafana"
 
 	// User facing string
 	ProviderUserKey                            = "provider"
@@ -130,6 +129,5 @@ const (
 	ImagePrometheusConfigReloaderUserKey       = "prometheus config reloader image"
 	ImagePrometheusOperatorUserKey             = "prometheus operator image"
 	ImagePrometheusStatsDExporterUserKey       = "prometheus statsd exporter image"
-	ImagePrometheusToCloudwatchUserKey         = "prometheus to cloudwatch image"
-	ImagePrometheusStackdriverSidecarUserKey   = "prometheus stackdriver sidecar image"
+	ImageGrafanaUserKey                        = "grafana image"
 )

From a98fc6c063984db29fd8212c3012d4b3f478eb39 Mon Sep 17 00:00:00 2001
From: Miguel Varela Ramos <miguel@cortexlabs.com>
Date: Mon, 8 Feb 2021 13:09:59 +0100
Subject: [PATCH 06/26] Fix grafana manifests path in AWS cluster install

---
 manager/install.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/manager/install.sh b/manager/install.sh
index 762d1647d7..0590c3ef57 100755
--- a/manager/install.sh
+++ b/manager/install.sh
@@ -68,7 +68,7 @@ function cluster_up_aws() {
   echo -n "￮ configuring metrics "
   envsubst < manifests/metrics-server.yaml | kubectl apply -f - >/dev/null
   setup_prometheus
-  kubectl apply -f /workspace/grafana/ >/dev/null
+  kubectl apply -f manifests/grafana/ >/dev/null
   echo "✓"
 
   if [[ "$CORTEX_INSTANCE_TYPE" == p* ]] || [[ "$CORTEX_INSTANCE_TYPE" == g* ]]; then

From 0275c408c23a5caecbd2b1736b04bbbf234eb718 Mon Sep 17 00:00:00 2001
From: Miguel Varela Ramos <miguel@cortexlabs.com>
Date: Mon, 8 Feb 2021 13:11:24 +0100
Subject: [PATCH 07/26] Add dashboard for batch apis

---
 .../grafana/grafana-dashboard-batch.yaml      | 374 ++++++++++++++++++
 ...d.yaml => grafana-dashboard-realtime.yaml} | 224 ++++++++++-
 manager/manifests/grafana/grafana.yaml        |  25 +-
 3 files changed, 599 insertions(+), 24 deletions(-)
 create mode 100644 manager/manifests/grafana/grafana-dashboard-batch.yaml
 rename manager/manifests/grafana/{grafana-dashboard.yaml => grafana-dashboard-realtime.yaml} (82%)

diff --git a/manager/manifests/grafana/grafana-dashboard-batch.yaml b/manager/manifests/grafana/grafana-dashboard-batch.yaml
new file mode 100644
index 0000000000..902fdd5671
--- /dev/null
+++ b/manager/manifests/grafana/grafana-dashboard-batch.yaml
@@ -0,0 +1,374 @@
+# Copyright 2021 Cortex Labs, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboard-batch
+  namespace: default
+data:
+  batch.json: |-
+    {
+      "annotations": {
+        "list": [
+          {
+            "builtIn": 1,
+            "datasource": "prometheus",
+            "enable": true,
+            "hide": true,
+            "iconColor": "rgba(0, 211, 255, 1)",
+            "name": "Annotations & Alerts",
+            "type": "dashboard"
+          }
+        ]
+      },
+      "editable": true,
+      "gnetId": null,
+      "graphTooltip": 0,
+      "id": 3,
+      "links": [],
+      "panels": [
+        {
+          "datasource": null,
+          "fieldConfig": {
+            "defaults": {
+              "custom": {}
+            },
+            "overrides": []
+          },
+          "gridPos": {
+            "h": 2,
+            "w": 24,
+            "x": 0,
+            "y": 0
+          },
+          "id": 7,
+          "options": {
+            "content": "<h1 style=\"text-align: center\">BatchAPI</h1>\n",
+            "mode": "markdown"
+          },
+          "pluginVersion": "7.4.0",
+          "timeFrom": null,
+          "timeShift": null,
+          "title": "",
+          "transparent": true,
+          "type": "text"
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": null,
+          "description": "Number of succeeded batches per API, for all API IDs",
+          "fieldConfig": {
+            "defaults": {
+              "custom": {}
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 9,
+            "w": 12,
+            "x": 0,
+            "y": 2
+          },
+          "hiddenSeries": false,
+          "id": 2,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "nullPointMode": "null",
+          "options": {
+            "alertThreshold": true
+          },
+          "percentage": false,
+          "pluginVersion": "7.4.0",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum(cortex_batch_succeeded) by (api_name)",
+              "interval": "",
+              "legendFormat": "{{api_name}}",
+              "refId": "A"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "# Succeeded Batches",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "decimals": 0,
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": null,
+          "description": "Number of failed batches per API, for all API IDs",
+          "fieldConfig": {
+            "defaults": {
+              "custom": {}
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 9,
+            "w": 12,
+            "x": 12,
+            "y": 2
+          },
+          "hiddenSeries": false,
+          "id": 3,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "nullPointMode": "null",
+          "options": {
+            "alertThreshold": true
+          },
+          "percentage": false,
+          "pluginVersion": "7.4.0",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum(cortex_batch_failed) by (api_name)",
+              "interval": "",
+              "legendFormat": "{{api_name}}",
+              "refId": "A"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "# Failed Batches",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "decimals": 0,
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": null,
+          "description": "Average time per batch per APIs, for all API IDs",
+          "fieldConfig": {
+            "defaults": {
+              "custom": {}
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 6,
+            "y": 11
+          },
+          "hiddenSeries": false,
+          "id": 5,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "nullPointMode": "null",
+          "options": {
+            "alertThreshold": true
+          },
+          "percentage": false,
+          "pluginVersion": "7.4.0",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum by (api_name) (rate(cortex_time_per_batch_sum[1h])) / sum by (api_name) (rate(cortex_time_per_batch_count[1h]))",
+              "interval": "",
+              "legendFormat": "{{api_name}}",
+              "refId": "A"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "Average Time per Batch",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "format": "ms",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        }
+      ],
+      "schemaVersion": 27,
+      "style": "dark",
+      "tags": [],
+      "templating": {
+        "list": []
+      },
+      "time": {
+        "from": "now-1h",
+        "to": "now"
+      },
+      "timepicker": {},
+      "timezone": "",
+      "title": "BatchAPI",
+      "uid": "nEiYFWEMk",
+      "version": 3
+    }
\ No newline at end of file
diff --git a/manager/manifests/grafana/grafana-dashboard.yaml b/manager/manifests/grafana/grafana-dashboard-realtime.yaml
similarity index 82%
rename from manager/manifests/grafana/grafana-dashboard.yaml
rename to manager/manifests/grafana/grafana-dashboard-realtime.yaml
index eb8a09b308..64250f2cca 100644
--- a/manager/manifests/grafana/grafana-dashboard.yaml
+++ b/manager/manifests/grafana/grafana-dashboard-realtime.yaml
@@ -15,10 +15,10 @@
 apiVersion: v1
 kind: ConfigMap
 metadata:
-  name: grafana-dashboard-cortex
+  name: grafana-dashboard-realtime
   namespace: default
 data:
-  cortex.json: |-
+  realtime.json: |-
     {
       "annotations": {
         "list": [
@@ -36,7 +36,7 @@ data:
       "editable": true,
       "gnetId": null,
       "graphTooltip": 0,
-      "id": 2,
+      "id": 8,
       "links": [],
       "panels": [
         {
@@ -48,20 +48,20 @@ data:
             "overrides": []
           },
           "gridPos": {
-            "h": 3,
+            "h": 2,
             "w": 24,
             "x": 0,
             "y": 0
           },
           "id": 12,
           "options": {
-            "content": "<h1 style=\"text-align: center\">Cortex Dashboard</h1>",
+            "content": "<h1 style=\"text-align: center\">RealtimeAPI</h1>",
             "mode": "html"
           },
           "pluginVersion": "7.4.0",
           "timeFrom": null,
           "timeShift": null,
-          "title": "Cortex Dashboard",
+          "title": "",
           "transparent": true,
           "type": "text"
         },
@@ -85,7 +85,7 @@ data:
             "h": 9,
             "w": 12,
             "x": 0,
-            "y": 3
+            "y": 2
           },
           "hiddenSeries": false,
           "id": 5,
@@ -191,7 +191,7 @@ data:
             "h": 9,
             "w": 12,
             "x": 12,
-            "y": 3
+            "y": 2
           },
           "hiddenSeries": false,
           "id": 2,
@@ -297,7 +297,7 @@ data:
             "h": 9,
             "w": 12,
             "x": 0,
-            "y": 12
+            "y": 11
           },
           "hiddenSeries": false,
           "id": 3,
@@ -403,7 +403,7 @@ data:
             "h": 9,
             "w": 12,
             "x": 12,
-            "y": 12
+            "y": 11
           },
           "hiddenSeries": false,
           "id": 4,
@@ -509,7 +509,7 @@ data:
             "h": 8,
             "w": 12,
             "x": 0,
-            "y": 21
+            "y": 20
           },
           "hiddenSeries": false,
           "id": 7,
@@ -615,7 +615,7 @@ data:
             "h": 8,
             "w": 12,
             "x": 12,
-            "y": 21
+            "y": 20
           },
           "hiddenSeries": false,
           "id": 9,
@@ -721,7 +721,7 @@ data:
             "h": 8,
             "w": 12,
             "x": 0,
-            "y": 29
+            "y": 28
           },
           "hiddenSeries": false,
           "id": 8,
@@ -827,7 +827,7 @@ data:
             "h": 8,
             "w": 12,
             "x": 12,
-            "y": 29
+            "y": 28
           },
           "hiddenSeries": false,
           "id": 10,
@@ -912,6 +912,198 @@ data:
             "align": false,
             "alignLevel": null
           }
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": null,
+          "description": "Number of in-flight requests per API, for all API IDs",
+          "fieldConfig": {
+            "defaults": {
+              "custom": {}
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 0,
+            "y": 36
+          },
+          "hiddenSeries": false,
+          "id": 14,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "nullPointMode": "null",
+          "options": {
+            "alertThreshold": true
+          },
+          "percentage": false,
+          "pluginVersion": "7.4.0",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "sum(cortex_in_flight_requests) by (api_name)",
+              "interval": "",
+              "legendFormat": "api-{{api_name}}",
+              "refId": "A"
+            }
+          ],
+          "thresholds": [],
+          "timeRegions": [],
+          "title": "In-Flight Requests",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "decimals": 0,
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
+        },
+        {
+          "aliasColors": {},
+          "bars": false,
+          "dashLength": 10,
+          "dashes": false,
+          "datasource": null,
+          "description": "Number of replicas per API, for all API IDs",
+          "fieldConfig": {
+            "defaults": {
+              "custom": {}
+            },
+            "overrides": []
+          },
+          "fill": 1,
+          "fillGradient": 0,
+          "gridPos": {
+            "h": 8,
+            "w": 12,
+            "x": 12,
+            "y": 36
+          },
+          "hiddenSeries": false,
+          "id": 15,
+          "legend": {
+            "avg": false,
+            "current": false,
+            "max": false,
+            "min": false,
+            "show": true,
+            "total": false,
+            "values": false
+          },
+          "lines": true,
+          "linewidth": 1,
+          "nullPointMode": "null",
+          "options": {
+            "alertThreshold": true
+          },
+          "percentage": false,
+          "pluginVersion": "7.4.0",
+          "pointradius": 2,
+          "points": false,
+          "renderer": "flot",
+          "seriesOverrides": [],
+          "spaceLength": 10,
+          "stack": false,
+          "steppedLine": false,
+          "targets": [
+            {
+              "expr": "count(cortex_in_flight_requests) by (api_name)",
+              "interval": "",
+              "legendFormat": "api-{{api_name}}",
+              "refId": "A"
+            }
+          ],
+          "thresholds": [],
+          "timeFrom": null,
+          "timeRegions": [],
+          "timeShift": null,
+          "title": "Replicas",
+          "tooltip": {
+            "shared": true,
+            "sort": 0,
+            "value_type": "individual"
+          },
+          "type": "graph",
+          "xaxis": {
+            "buckets": null,
+            "mode": "time",
+            "name": null,
+            "show": true,
+            "values": []
+          },
+          "yaxes": [
+            {
+              "decimals": 0,
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": "0",
+              "show": true
+            },
+            {
+              "format": "short",
+              "label": null,
+              "logBase": 1,
+              "max": null,
+              "min": null,
+              "show": true
+            }
+          ],
+          "yaxis": {
+            "align": false,
+            "alignLevel": null
+          }
         }
       ],
       "refresh": "30s",
@@ -927,7 +1119,7 @@ data:
       },
       "timepicker": {},
       "timezone": "",
-      "title": "Cortex",
+      "title": "RealtimeAPI",
       "uid": "ZQv3auYGk",
       "version": 1
-    }
+    }
\ No newline at end of file
diff --git a/manager/manifests/grafana/grafana.yaml b/manager/manifests/grafana/grafana.yaml
index bf2843dd3f..9230dc6167 100644
--- a/manager/manifests/grafana/grafana.yaml
+++ b/manager/manifests/grafana/grafana.yaml
@@ -29,7 +29,8 @@ data:
                 "orgId": 1,
                 "type": "prometheus",
                 "url": "http://prometheus.default:9090",
-                "version": 1
+                "version": 1,
+                "isDefault": true
             }
         ]
     }
@@ -47,10 +48,10 @@ data:
         "apiVersion": 1,
         "providers": [
             {
-                "folder": "Default",
-                "name": "0",
+                "folder": "Cortex",
+                "name": "Cortex",
                 "options": {
-                    "path": "/grafana-dashboard-definitions/0"
+                    "path": "/grafana-dashboard-definitions/cortex"
                 },
                 "disableDeletion": true,
                 "orgId": 1,
@@ -116,6 +117,8 @@ spec:
               value: "%(protocol)s://%(domain)s:%(http_port)s/dashboard"
             - name: GF_SERVER_SERVE_FROM_SUB_PATH
               value: "true"
+            - name: GF_USERS_DEFAULT_THEME
+              value: "light"
           volumeMounts:
             - mountPath: /var/lib/grafana
               name: grafana-storage
@@ -126,8 +129,11 @@ spec:
             - mountPath: /etc/grafana/provisioning/dashboards
               name: grafana-dashboards
               readOnly: false
-            - mountPath: /grafana-dashboard-definitions/0/cortex
-              name: grafana-dashboard-cortex
+            - mountPath: /grafana-dashboard-definitions/cortex/realtime
+              name: grafana-dashboard-realtime
+              readOnly: false
+            - mountPath: /grafana-dashboard-definitions/cortex/batch
+              name: grafana-dashboard-batch
               readOnly: false
       securityContext:
         fsGroup: 65534
@@ -143,9 +149,12 @@ spec:
         - name: grafana-dashboards
           configMap:
             name: grafana-dashboards
-        - name: grafana-dashboard-cortex
+        - name: grafana-dashboard-realtime
+          configMap:
+            name: grafana-dashboard-realtime
+        - name: grafana-dashboard-batch
           configMap:
-            name: grafana-dashboard-cortex
+            name: grafana-dashboard-batch
 
 ---
 

From 4675be50e506b720492d835c4f664624c9026fe7 Mon Sep 17 00:00:00 2001
From: Miguel Varela Ramos <miguel@cortexlabs.com>
Date: Mon, 8 Feb 2021 16:39:13 +0100
Subject: [PATCH 08/26] Refactor dashboards with templating

---
 .../grafana/grafana-dashboard-batch.yaml      |  94 +++-
 .../grafana/grafana-dashboard-realtime.yaml   | 451 +++++++++++-------
 2 files changed, 355 insertions(+), 190 deletions(-)

diff --git a/manager/manifests/grafana/grafana-dashboard-batch.yaml b/manager/manifests/grafana/grafana-dashboard-batch.yaml
index 902fdd5671..3293750bd7 100644
--- a/manager/manifests/grafana/grafana-dashboard-batch.yaml
+++ b/manager/manifests/grafana/grafana-dashboard-batch.yaml
@@ -36,7 +36,6 @@ data:
       "editable": true,
       "gnetId": null,
       "graphTooltip": 0,
-      "id": 3,
       "links": [],
       "panels": [
         {
@@ -71,7 +70,7 @@ data:
           "dashLength": 10,
           "dashes": false,
           "datasource": null,
-          "description": "Number of succeeded batches per API, for all API IDs",
+          "description": "Number of succeeded batches for an API",
           "fieldConfig": {
             "defaults": {
               "custom": {}
@@ -89,13 +88,14 @@ data:
           "hiddenSeries": false,
           "id": 2,
           "legend": {
+            "alignAsTable": true,
             "avg": false,
-            "current": false,
-            "max": false,
-            "min": false,
+            "current": true,
+            "max": true,
+            "min": true,
             "show": true,
             "total": false,
-            "values": false
+            "values": true
           },
           "lines": true,
           "linewidth": 1,
@@ -114,7 +114,7 @@ data:
           "steppedLine": false,
           "targets": [
             {
-              "expr": "sum(cortex_batch_succeeded) by (api_name)",
+              "expr": "sum(cortex_batch_succeeded{api_name=~\"$api_name\"}) by (api_name)",
               "interval": "",
               "legendFormat": "{{api_name}}",
               "refId": "A"
@@ -140,6 +140,7 @@ data:
           },
           "yaxes": [
             {
+              "$$hashKey": "object:26",
               "decimals": 0,
               "format": "short",
               "label": null,
@@ -149,6 +150,7 @@ data:
               "show": true
             },
             {
+              "$$hashKey": "object:27",
               "format": "short",
               "label": null,
               "logBase": 1,
@@ -168,7 +170,7 @@ data:
           "dashLength": 10,
           "dashes": false,
           "datasource": null,
-          "description": "Number of failed batches per API, for all API IDs",
+          "description": "Number of failed batches for an API",
           "fieldConfig": {
             "defaults": {
               "custom": {}
@@ -186,13 +188,14 @@ data:
           "hiddenSeries": false,
           "id": 3,
           "legend": {
+            "alignAsTable": true,
             "avg": false,
-            "current": false,
-            "max": false,
-            "min": false,
+            "current": true,
+            "max": true,
+            "min": true,
             "show": true,
             "total": false,
-            "values": false
+            "values": true
           },
           "lines": true,
           "linewidth": 1,
@@ -211,7 +214,7 @@ data:
           "steppedLine": false,
           "targets": [
             {
-              "expr": "sum(cortex_batch_failed) by (api_name)",
+              "expr": "sum(cortex_batch_failed{api_name=~\"$api_name\"}) by (api_name)",
               "interval": "",
               "legendFormat": "{{api_name}}",
               "refId": "A"
@@ -265,7 +268,7 @@ data:
           "dashLength": 10,
           "dashes": false,
           "datasource": null,
-          "description": "Average time per batch per APIs, for all API IDs",
+          "description": "Average time per batch for an API",
           "fieldConfig": {
             "defaults": {
               "custom": {}
@@ -283,13 +286,14 @@ data:
           "hiddenSeries": false,
           "id": 5,
           "legend": {
-            "avg": false,
-            "current": false,
-            "max": false,
-            "min": false,
+            "alignAsTable": true,
+            "avg": true,
+            "current": true,
+            "max": true,
+            "min": true,
             "show": true,
             "total": false,
-            "values": false
+            "values": true
           },
           "lines": true,
           "linewidth": 1,
@@ -308,7 +312,7 @@ data:
           "steppedLine": false,
           "targets": [
             {
-              "expr": "sum by (api_name) (rate(cortex_time_per_batch_sum[1h])) / sum by (api_name) (rate(cortex_time_per_batch_count[1h]))",
+              "expr": "sum(cortex_time_per_batch_sum{api_name=~\"$api_name\"}) by (api_name) / sum(cortex_time_per_batch_count{api_name=~\"$api_name\"}) by (api_name)",
               "interval": "",
               "legendFormat": "{{api_name}}",
               "refId": "A"
@@ -356,11 +360,55 @@ data:
           }
         }
       ],
+      "refresh": "30s",
       "schemaVersion": 27,
       "style": "dark",
       "tags": [],
       "templating": {
-        "list": []
+        "list": [
+          {
+            "allValue": null,
+            "current": {
+              "selected": true,
+              "tags": [],
+              "text": [
+                "image-classifier"
+              ],
+              "value": [
+                "image-classifier"
+              ]
+            },
+            "datasource": null,
+            "definition": "label_values({__name__=~\"cortex_batch_.+\"}, api_name)",
+            "description": null,
+            "error": null,
+            "hide": 0,
+            "includeAll": false,
+            "label": "API Name",
+            "multi": true,
+            "name": "api_name",
+            "options": [
+              {
+                "selected": true,
+                "text": "image-classifier",
+                "value": "image-classifier"
+              }
+            ],
+            "query": {
+              "query": "label_values({__name__=~\"cortex_batch_.+\"}, api_name)",
+              "refId": "StandardVariableQuery"
+            },
+            "refresh": 0,
+            "regex": "",
+            "skipUrlSync": false,
+            "sort": 0,
+            "tagValuesQuery": "",
+            "tags": [],
+            "tagsQuery": "",
+            "type": "query",
+            "useTags": false
+          }
+        ]
       },
       "time": {
         "from": "now-1h",
@@ -370,5 +418,5 @@ data:
       "timezone": "",
       "title": "BatchAPI",
       "uid": "nEiYFWEMk",
-      "version": 3
-    }
\ No newline at end of file
+      "version": 1
+    }
diff --git a/manager/manifests/grafana/grafana-dashboard-realtime.yaml b/manager/manifests/grafana/grafana-dashboard-realtime.yaml
index 64250f2cca..3d98b4d67a 100644
--- a/manager/manifests/grafana/grafana-dashboard-realtime.yaml
+++ b/manager/manifests/grafana/grafana-dashboard-realtime.yaml
@@ -36,11 +36,12 @@ data:
       "editable": true,
       "gnetId": null,
       "graphTooltip": 0,
-      "id": 8,
+      "id": 10,
+      "iteration": 1612793050833,
       "links": [],
       "panels": [
         {
-          "datasource": "prometheus",
+          "datasource": null,
           "fieldConfig": {
             "defaults": {
               "custom": {}
@@ -53,10 +54,10 @@ data:
             "x": 0,
             "y": 0
           },
-          "id": 12,
+          "id": 15,
           "options": {
             "content": "<h1 style=\"text-align: center\">RealtimeAPI</h1>",
-            "mode": "html"
+            "mode": "markdown"
           },
           "pluginVersion": "7.4.0",
           "timeFrom": null,
@@ -70,12 +71,11 @@ data:
           "bars": false,
           "dashLength": 10,
           "dashes": false,
-          "datasource": "prometheus",
-          "description": "Rate of all responses by API, for all API IDs",
+          "datasource": null,
+          "description": "Request rate, computed over every minute, of an API",
           "fieldConfig": {
             "defaults": {
-              "custom": {},
-              "unit": "reqps"
+              "custom": {}
             },
             "overrides": []
           },
@@ -88,15 +88,17 @@ data:
             "y": 2
           },
           "hiddenSeries": false,
-          "id": 5,
+          "id": 2,
           "legend": {
-            "avg": false,
-            "current": false,
-            "max": false,
-            "min": false,
+            "alignAsTable": true,
+            "avg": true,
+            "current": true,
+            "max": true,
+            "min": true,
+            "rightSide": false,
             "show": true,
             "total": false,
-            "values": false
+            "values": true
           },
           "lines": true,
           "linewidth": 1,
@@ -115,10 +117,10 @@ data:
           "steppedLine": false,
           "targets": [
             {
-              "expr": "sum(rate(istio_requests_total{destination_service_name=~\"api-.*\", response_code=~\"2.*\"}[5m])) by (destination_service_name, response_code)",
+              "expr": "sum (rate(istio_requests_total{destination_service_name=~\"api-$api_name.+\"}[1m])) by (destination_service_name)",
               "interval": "",
               "legendFormat": "{{destination_service_name}}",
-              "refId": "A"
+              "refId": "2XX"
             }
           ],
           "thresholds": [],
@@ -138,6 +140,13 @@ data:
                 "regex": "([^\\.]+)\\..+",
                 "renamePattern": "$1"
               }
+            },
+            {
+              "id": "renameByRegex",
+              "options": {
+                "regex": "api-(.*)",
+                "renamePattern": "$1"
+              }
             }
           ],
           "type": "graph",
@@ -176,12 +185,11 @@ data:
           "bars": false,
           "dashLength": 10,
           "dashes": false,
-          "datasource": "prometheus",
-          "description": "Rate of 2XX status codes returned by API, for all API IDs",
+          "datasource": null,
+          "description": "Active in-flight requests for an API.\n\nNote: In-flight requests are recorded every 10 seconds, which will correspond to the minimum resolution.",
           "fieldConfig": {
             "defaults": {
-              "custom": {},
-              "unit": "reqps"
+              "custom": {}
             },
             "overrides": []
           },
@@ -194,15 +202,16 @@ data:
             "y": 2
           },
           "hiddenSeries": false,
-          "id": 2,
+          "id": 4,
           "legend": {
-            "avg": false,
-            "current": false,
-            "max": false,
-            "min": false,
+            "alignAsTable": true,
+            "avg": true,
+            "current": true,
+            "max": true,
+            "min": true,
             "show": true,
             "total": false,
-            "values": false
+            "values": true
           },
           "lines": true,
           "linewidth": 1,
@@ -221,9 +230,9 @@ data:
           "steppedLine": false,
           "targets": [
             {
-              "expr": "sum(rate(istio_requests_total{destination_service_name=~\"api-.*\", response_code=~\"2.*\"}[5m])) by (destination_service_name, response_code)",
+              "expr": "sum(cortex_in_flight_requests{api_name=~\"$api_name\"}) by (api_name)",
               "interval": "",
-              "legendFormat": "{{destination_service_name}}",
+              "legendFormat": "{{api_name}}",
               "refId": "A"
             }
           ],
@@ -231,21 +240,12 @@ data:
           "timeFrom": null,
           "timeRegions": [],
           "timeShift": null,
-          "title": "2XX Responses",
+          "title": "In-Flight Requests",
           "tooltip": {
             "shared": true,
             "sort": 0,
             "value_type": "individual"
           },
-          "transformations": [
-            {
-              "id": "renameByRegex",
-              "options": {
-                "regex": "([^\\.]+)\\..+",
-                "renamePattern": "$1"
-              }
-            }
-          ],
           "type": "graph",
           "xaxis": {
             "buckets": null,
@@ -256,7 +256,8 @@ data:
           },
           "yaxes": [
             {
-              "format": "reqps",
+              "decimals": 0,
+              "format": "short",
               "label": null,
               "logBase": 1,
               "max": null,
@@ -282,12 +283,11 @@ data:
           "bars": false,
           "dashLength": 10,
           "dashes": false,
-          "datasource": "prometheus",
-          "description": "Rate of 4XX status codes returned by API, for all API IDs",
+          "datasource": null,
+          "description": "Request rate, computed over every minute, for responses with status code 2XX of an API",
           "fieldConfig": {
             "defaults": {
-              "custom": {},
-              "unit": "reqps"
+              "custom": {}
             },
             "overrides": []
           },
@@ -300,15 +300,17 @@ data:
             "y": 11
           },
           "hiddenSeries": false,
-          "id": 3,
+          "id": 8,
           "legend": {
-            "avg": false,
-            "current": false,
-            "max": false,
-            "min": false,
+            "alignAsTable": true,
+            "avg": true,
+            "current": true,
+            "max": true,
+            "min": true,
+            "rightSide": false,
             "show": true,
             "total": false,
-            "values": false
+            "values": true
           },
           "lines": true,
           "linewidth": 1,
@@ -327,17 +329,17 @@ data:
           "steppedLine": false,
           "targets": [
             {
-              "expr": "sum(rate(istio_requests_total{destination_service_name=~\"api-.*\", response_code=~\"4.*\"}[5m])) by (destination_service_name, response_code)",
+              "expr": "sum(rate(istio_requests_total{destination_service_name=~\"api-$api_name.+\", response_code=~\"2.*\"}[1m])) by (destination_service_name, response_code)",
               "interval": "",
               "legendFormat": "{{destination_service_name}}",
-              "refId": "A"
+              "refId": "2XX"
             }
           ],
           "thresholds": [],
           "timeFrom": null,
           "timeRegions": [],
           "timeShift": null,
-          "title": "4XX Responses",
+          "title": "2XX Responses",
           "tooltip": {
             "shared": true,
             "sort": 0,
@@ -350,6 +352,13 @@ data:
                 "regex": "([^\\.]+)\\..+",
                 "renamePattern": "$1"
               }
+            },
+            {
+              "id": "renameByRegex",
+              "options": {
+                "regex": "api-(.*)",
+                "renamePattern": "$1"
+              }
             }
           ],
           "type": "graph",
@@ -388,12 +397,10 @@ data:
           "bars": false,
           "dashLength": 10,
           "dashes": false,
-          "datasource": "prometheus",
-          "description": "Rate of 5XX status codes returned by API, for all API IDs",
+          "datasource": null,
           "fieldConfig": {
             "defaults": {
-              "custom": {},
-              "unit": "reqps"
+              "custom": {}
             },
             "overrides": []
           },
@@ -406,15 +413,16 @@ data:
             "y": 11
           },
           "hiddenSeries": false,
-          "id": 4,
+          "id": 7,
           "legend": {
-            "avg": false,
-            "current": false,
-            "max": false,
-            "min": false,
+            "alignAsTable": true,
+            "avg": true,
+            "current": true,
+            "max": true,
+            "min": true,
             "show": true,
             "total": false,
-            "values": false
+            "values": true
           },
           "lines": true,
           "linewidth": 1,
@@ -433,9 +441,9 @@ data:
           "steppedLine": false,
           "targets": [
             {
-              "expr": "sum(rate(istio_requests_total{destination_service_name=~\"api-.*\", response_code=~\"5.*\"}[5m])) by (destination_service_name, response_code)",
+              "expr": "count(cortex_in_flight_requests{api_name=~\"$api_name\"}) by (api_name)",
               "interval": "",
-              "legendFormat": "{{destination_service_name}}",
+              "legendFormat": "{{api_name}}",
               "refId": "A"
             }
           ],
@@ -443,21 +451,12 @@ data:
           "timeFrom": null,
           "timeRegions": [],
           "timeShift": null,
-          "title": "5XX Responses",
+          "title": "Active Replicas",
           "tooltip": {
             "shared": true,
             "sort": 0,
             "value_type": "individual"
           },
-          "transformations": [
-            {
-              "id": "renameByRegex",
-              "options": {
-                "regex": "([^\\.]+)\\..+",
-                "renamePattern": "$1"
-              }
-            }
-          ],
           "type": "graph",
           "xaxis": {
             "buckets": null,
@@ -468,7 +467,8 @@ data:
           },
           "yaxes": [
             {
-              "format": "reqps",
+              "decimals": 0,
+              "format": "short",
               "label": null,
               "logBase": 1,
               "max": null,
@@ -494,33 +494,34 @@ data:
           "bars": false,
           "dashLength": 10,
           "dashes": false,
-          "datasource": "prometheus",
-          "description": "99th percentile for request latency per API, for all API IDs",
+          "datasource": null,
+          "description": "Request rate, computed over every minute, for responses with status code 4XX of an API",
           "fieldConfig": {
             "defaults": {
-              "custom": {},
-              "unit": "ms"
+              "custom": {}
             },
             "overrides": []
           },
           "fill": 1,
           "fillGradient": 0,
           "gridPos": {
-            "h": 8,
+            "h": 9,
             "w": 12,
             "x": 0,
             "y": 20
           },
           "hiddenSeries": false,
-          "id": 7,
+          "id": 9,
           "legend": {
-            "avg": false,
-            "current": false,
-            "max": false,
-            "min": false,
+            "alignAsTable": true,
+            "avg": true,
+            "current": true,
+            "max": true,
+            "min": true,
+            "rightSide": false,
             "show": true,
             "total": false,
-            "values": false
+            "values": true
           },
           "lines": true,
           "linewidth": 1,
@@ -539,17 +540,17 @@ data:
           "steppedLine": false,
           "targets": [
             {
-              "expr": "histogram_quantile(0.99, sum without (response_code) (rate(istio_request_duration_milliseconds_bucket{destination_service_name=~\"api-.*\"}[5m])))",
+              "expr": "sum(rate(istio_requests_total{destination_service_name=~\"api-$api_name.+\", response_code=~\"4.+\"}[1m])) by (destination_service_name, response_code)",
               "interval": "",
               "legendFormat": "{{destination_service_name}}",
-              "refId": "A"
+              "refId": "4XX"
             }
           ],
           "thresholds": [],
           "timeFrom": null,
           "timeRegions": [],
           "timeShift": null,
-          "title": "p99 Latency",
+          "title": "4XX Responses",
           "tooltip": {
             "shared": true,
             "sort": 0,
@@ -562,6 +563,13 @@ data:
                 "regex": "([^\\.]+)\\..+",
                 "renamePattern": "$1"
               }
+            },
+            {
+              "id": "renameByRegex",
+              "options": {
+                "regex": "api-(.*)",
+                "renamePattern": "$1"
+              }
             }
           ],
           "type": "graph",
@@ -574,7 +582,7 @@ data:
           },
           "yaxes": [
             {
-              "format": "ms",
+              "format": "reqps",
               "label": null,
               "logBase": 1,
               "max": null,
@@ -600,33 +608,34 @@ data:
           "bars": false,
           "dashLength": 10,
           "dashes": false,
-          "datasource": "prometheus",
-          "description": "90th percentile for request latency per API, for all API IDs",
+          "datasource": null,
+          "description": "Request rate, computed over every minute, for responses with status code 5XX of an API",
           "fieldConfig": {
             "defaults": {
-              "custom": {},
-              "unit": "ms"
+              "custom": {}
             },
             "overrides": []
           },
           "fill": 1,
           "fillGradient": 0,
           "gridPos": {
-            "h": 8,
+            "h": 9,
             "w": 12,
             "x": 12,
             "y": 20
           },
           "hiddenSeries": false,
-          "id": 9,
+          "id": 10,
           "legend": {
-            "avg": false,
-            "current": false,
-            "max": false,
-            "min": false,
+            "alignAsTable": true,
+            "avg": true,
+            "current": true,
+            "max": true,
+            "min": true,
+            "rightSide": false,
             "show": true,
             "total": false,
-            "values": false
+            "values": true
           },
           "lines": true,
           "linewidth": 1,
@@ -645,17 +654,17 @@ data:
           "steppedLine": false,
           "targets": [
             {
-              "expr": "histogram_quantile(0.9, sum without (response_code) (rate(istio_request_duration_milliseconds_bucket{destination_service_name=~\"api-.*\"}[5m])))",
+              "expr": "sum(rate(istio_requests_total{destination_service_name=~\"api-$api_name.+\", response_code=~\"5.+\"}[1m])) by (destination_service_name, response_code)",
               "interval": "",
               "legendFormat": "{{destination_service_name}}",
-              "refId": "A"
+              "refId": "5XX"
             }
           ],
           "thresholds": [],
           "timeFrom": null,
           "timeRegions": [],
           "timeShift": null,
-          "title": "p90 Latency",
+          "title": "5XX Responses",
           "tooltip": {
             "shared": true,
             "sort": 0,
@@ -668,6 +677,13 @@ data:
                 "regex": "([^\\.]+)\\..+",
                 "renamePattern": "$1"
               }
+            },
+            {
+              "id": "renameByRegex",
+              "options": {
+                "regex": "api-(.*)",
+                "renamePattern": "$1"
+              }
             }
           ],
           "type": "graph",
@@ -680,7 +696,7 @@ data:
           },
           "yaxes": [
             {
-              "format": "ms",
+              "format": "reqps",
               "label": null,
               "logBase": 1,
               "max": null,
@@ -706,33 +722,33 @@ data:
           "bars": false,
           "dashLength": 10,
           "dashes": false,
-          "datasource": "prometheus",
-          "description": "50th percentile for request latency per API, for all API IDs",
+          "datasource": null,
+          "description": "99th percentile latency, computed over a minute, for an API",
           "fieldConfig": {
             "defaults": {
-              "custom": {},
-              "unit": "ms"
+              "custom": {}
             },
             "overrides": []
           },
           "fill": 1,
           "fillGradient": 0,
           "gridPos": {
-            "h": 8,
+            "h": 9,
             "w": 12,
             "x": 0,
-            "y": 28
+            "y": 29
           },
           "hiddenSeries": false,
-          "id": 8,
+          "id": 6,
           "legend": {
-            "avg": false,
-            "current": false,
-            "max": false,
-            "min": false,
+            "alignAsTable": true,
+            "avg": true,
+            "current": true,
+            "max": true,
+            "min": true,
             "show": true,
             "total": false,
-            "values": false
+            "values": true
           },
           "lines": true,
           "linewidth": 1,
@@ -751,7 +767,7 @@ data:
           "steppedLine": false,
           "targets": [
             {
-              "expr": "histogram_quantile(0.5, sum without (response_code) (rate(istio_request_duration_milliseconds_bucket{destination_service_name=~\"api-.*\"}[5m])))",
+              "expr": "histogram_quantile(0.99, sum without (response_code) (rate(istio_request_duration_milliseconds_bucket{destination_service_name=~\"api-$api_name.+\"}[1m])))",
               "interval": "",
               "legendFormat": "{{destination_service_name}}",
               "refId": "A"
@@ -761,7 +777,7 @@ data:
           "timeFrom": null,
           "timeRegions": [],
           "timeShift": null,
-          "title": "p50 Latency",
+          "title": "p99 Latency",
           "tooltip": {
             "shared": true,
             "sort": 0,
@@ -774,6 +790,13 @@ data:
                 "regex": "([^\\.]+)\\..+",
                 "renamePattern": "$1"
               }
+            },
+            {
+              "id": "renameByRegex",
+              "options": {
+                "regex": "api-(.*)",
+                "renamePattern": "$1"
+              }
             }
           ],
           "type": "graph",
@@ -812,33 +835,33 @@ data:
           "bars": false,
           "dashLength": 10,
           "dashes": false,
-          "datasource": "prometheus",
-          "description": "Average request latency per API, for all API IDs",
+          "datasource": null,
+          "description": "90th percentile latency, computed over a minute, for an API",
           "fieldConfig": {
             "defaults": {
-              "custom": {},
-              "unit": "ms"
+              "custom": {}
             },
             "overrides": []
           },
           "fill": 1,
           "fillGradient": 0,
           "gridPos": {
-            "h": 8,
+            "h": 9,
             "w": 12,
             "x": 12,
-            "y": 28
+            "y": 29
           },
           "hiddenSeries": false,
-          "id": 10,
+          "id": 11,
           "legend": {
-            "avg": false,
-            "current": false,
-            "max": false,
-            "min": false,
+            "alignAsTable": true,
+            "avg": true,
+            "current": true,
+            "max": true,
+            "min": true,
             "show": true,
             "total": false,
-            "values": false
+            "values": true
           },
           "lines": true,
           "linewidth": 1,
@@ -857,17 +880,18 @@ data:
           "steppedLine": false,
           "targets": [
             {
-              "expr": "  sum without (response_code) (rate(istio_request_duration_milliseconds_sum{destination_service_name=~\"api-.*\"}[5m]))\n/\n  sum without (response_code) (rate(istio_request_duration_milliseconds_count{destination_service_name=~\"api-.*\"}[5m]))",
+              "expr": "histogram_quantile(0.90, sum without (response_code) (rate(istio_request_duration_milliseconds_bucket{destination_service_name=~\"api-$api_name.+\"}[1m])))",
+              "hide": false,
               "interval": "",
               "legendFormat": "{{destination_service_name}}",
-              "refId": "A"
+              "refId": "B"
             }
           ],
           "thresholds": [],
           "timeFrom": null,
           "timeRegions": [],
           "timeShift": null,
-          "title": "Average Latency",
+          "title": "p90 Latency",
           "tooltip": {
             "shared": true,
             "sort": 0,
@@ -880,6 +904,13 @@ data:
                 "regex": "([^\\.]+)\\..+",
                 "renamePattern": "$1"
               }
+            },
+            {
+              "id": "renameByRegex",
+              "options": {
+                "regex": "api-(.*)",
+                "renamePattern": "$1"
+              }
             }
           ],
           "type": "graph",
@@ -919,7 +950,7 @@ data:
           "dashLength": 10,
           "dashes": false,
           "datasource": null,
-          "description": "Number of in-flight requests per API, for all API IDs",
+          "description": "50th percentile latency, computed over a minute, for an API",
           "fieldConfig": {
             "defaults": {
               "custom": {}
@@ -929,21 +960,22 @@ data:
           "fill": 1,
           "fillGradient": 0,
           "gridPos": {
-            "h": 8,
+            "h": 9,
             "w": 12,
             "x": 0,
-            "y": 36
+            "y": 38
           },
           "hiddenSeries": false,
-          "id": 14,
+          "id": 16,
           "legend": {
-            "avg": false,
-            "current": false,
-            "max": false,
-            "min": false,
+            "alignAsTable": true,
+            "avg": true,
+            "current": true,
+            "max": true,
+            "min": true,
             "show": true,
             "total": false,
-            "values": false
+            "values": true
           },
           "lines": true,
           "linewidth": 1,
@@ -962,20 +994,39 @@ data:
           "steppedLine": false,
           "targets": [
             {
-              "expr": "sum(cortex_in_flight_requests) by (api_name)",
+              "expr": "histogram_quantile(0.50, sum without (response_code) (rate(istio_request_duration_milliseconds_bucket{destination_service_name=~\"api-$api_name.+\"}[1m])))",
+              "hide": false,
               "interval": "",
-              "legendFormat": "api-{{api_name}}",
-              "refId": "A"
+              "legendFormat": "{{destination_service_name}}",
+              "refId": "B"
             }
           ],
           "thresholds": [],
+          "timeFrom": null,
           "timeRegions": [],
-          "title": "In-Flight Requests",
+          "timeShift": null,
+          "title": "p50 Latency",
           "tooltip": {
             "shared": true,
             "sort": 0,
             "value_type": "individual"
           },
+          "transformations": [
+            {
+              "id": "renameByRegex",
+              "options": {
+                "regex": "([^\\.]+)\\..+",
+                "renamePattern": "$1"
+              }
+            },
+            {
+              "id": "renameByRegex",
+              "options": {
+                "regex": "api-(.*)",
+                "renamePattern": "$1"
+              }
+            }
+          ],
           "type": "graph",
           "xaxis": {
             "buckets": null,
@@ -986,8 +1037,7 @@ data:
           },
           "yaxes": [
             {
-              "decimals": 0,
-              "format": "short",
+              "format": "ms",
               "label": null,
               "logBase": 1,
               "max": null,
@@ -1014,7 +1064,7 @@ data:
           "dashLength": 10,
           "dashes": false,
           "datasource": null,
-          "description": "Number of replicas per API, for all API IDs",
+          "description": "Average latency, computed over a minute, for an API",
           "fieldConfig": {
             "defaults": {
               "custom": {}
@@ -1024,21 +1074,22 @@ data:
           "fill": 1,
           "fillGradient": 0,
           "gridPos": {
-            "h": 8,
+            "h": 9,
             "w": 12,
             "x": 12,
-            "y": 36
+            "y": 38
           },
           "hiddenSeries": false,
-          "id": 15,
+          "id": 12,
           "legend": {
-            "avg": false,
-            "current": false,
-            "max": false,
-            "min": false,
+            "alignAsTable": true,
+            "avg": true,
+            "current": true,
+            "max": true,
+            "min": true,
             "show": true,
             "total": false,
-            "values": false
+            "values": true
           },
           "lines": true,
           "linewidth": 1,
@@ -1057,22 +1108,39 @@ data:
           "steppedLine": false,
           "targets": [
             {
-              "expr": "count(cortex_in_flight_requests) by (api_name)",
+              "expr": "sum(rate(istio_request_duration_milliseconds_sum{destination_service_name=~\"api-$api_name.+\"}[1m])) by (destination_service_name) / sum(rate(istio_request_duration_milliseconds_count{destination_service_name=~\"api-$api_name.+\"}[1m])) by (destination_service_name)",
+              "hide": false,
               "interval": "",
-              "legendFormat": "api-{{api_name}}",
-              "refId": "A"
+              "legendFormat": "{{destination_service_name}}",
+              "refId": "D"
             }
           ],
           "thresholds": [],
           "timeFrom": null,
           "timeRegions": [],
           "timeShift": null,
-          "title": "Replicas",
+          "title": "Average Latency",
           "tooltip": {
             "shared": true,
             "sort": 0,
             "value_type": "individual"
           },
+          "transformations": [
+            {
+              "id": "renameByRegex",
+              "options": {
+                "regex": "([^\\.]+)\\..+",
+                "renamePattern": "$1"
+              }
+            },
+            {
+              "id": "renameByRegex",
+              "options": {
+                "regex": "api-(.*)",
+                "renamePattern": "$1"
+              }
+            }
+          ],
           "type": "graph",
           "xaxis": {
             "buckets": null,
@@ -1083,8 +1151,7 @@ data:
           },
           "yaxes": [
             {
-              "decimals": 0,
-              "format": "short",
+              "format": "ms",
               "label": null,
               "logBase": 1,
               "max": null,
@@ -1111,7 +1178,57 @@ data:
       "style": "dark",
       "tags": [],
       "templating": {
-        "list": []
+        "list": [
+          {
+            "allValue": null,
+            "current": {
+              "selected": true,
+              "tags": [],
+              "text": [
+                "autoscaling",
+                "iris-classifier"
+              ],
+              "value": [
+                "autoscaling",
+                "iris-classifier"
+              ]
+            },
+            "datasource": null,
+            "definition": "label_values(cortex_in_flight_requests{api_kind=\"RealtimeAPI\"}, api_name)",
+            "description": null,
+            "error": null,
+            "hide": 0,
+            "includeAll": false,
+            "label": "API Name",
+            "multi": true,
+            "name": "api_name",
+            "options": [
+              {
+                "selected": true,
+                "text": "autoscaling",
+                "value": "autoscaling"
+              },
+              {
+                "selected": true,
+                "text": "iris-classifier",
+                "value": "iris-classifier"
+              }
+            ],
+            "query": {
+              "query": "label_values(cortex_in_flight_requests{api_kind=\"RealtimeAPI\"}, api_name)",
+              "refId": "StandardVariableQuery"
+            },
+            "refresh": 1,
+            "regex": "",
+            "skipUrlSync": false,
+            "sort": 0,
+            "tagValuesQuery": "",
+            "tags": [],
+            "tagsQuery": "",
+            "type": "query",
+            "useTags": false
+          }
+        ]
       },
       "time": {
         "from": "now-1h",
@@ -1120,6 +1237,6 @@ data:
       "timepicker": {},
       "timezone": "",
       "title": "RealtimeAPI",
-      "uid": "ZQv3auYGk",
-      "version": 1
+      "uid": "xvWFsZPGk",
+      "version": 5
     }
\ No newline at end of file

From f08f8294d5c6211cb56553b5aaf52ed2caa62a15 Mon Sep 17 00:00:00 2001
From: Miguel Varela Ramos <miguel@cortexlabs.com>
Date: Mon, 8 Feb 2021 16:43:24 +0100
Subject: [PATCH 09/26] Fix linting error

---
 manager/manifests/grafana/grafana-dashboard-realtime.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/manager/manifests/grafana/grafana-dashboard-realtime.yaml b/manager/manifests/grafana/grafana-dashboard-realtime.yaml
index 3d98b4d67a..04cd941180 100644
--- a/manager/manifests/grafana/grafana-dashboard-realtime.yaml
+++ b/manager/manifests/grafana/grafana-dashboard-realtime.yaml
@@ -1239,4 +1239,4 @@ data:
       "title": "RealtimeAPI",
       "uid": "xvWFsZPGk",
       "version": 5
-    }
\ No newline at end of file
+    }

From 13897a0fb995a434a58b4671010eabbbe5bc3fc4 Mon Sep 17 00:00:00 2001
From: Miguel Varela Ramos <miguel@cortexlabs.com>
Date: Mon, 8 Feb 2021 17:02:02 +0100
Subject: [PATCH 10/26] Fix grafana installation

---
 manager/install.sh | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/manager/install.sh b/manager/install.sh
index 0590c3ef57..ceb71cd8fd 100755
--- a/manager/install.sh
+++ b/manager/install.sh
@@ -68,7 +68,7 @@ function cluster_up_aws() {
   echo -n "￮ configuring metrics "
   envsubst < manifests/metrics-server.yaml | kubectl apply -f - >/dev/null
   setup_prometheus
-  kubectl apply -f manifests/grafana/ >/dev/null
+  setup_grafana
   echo "✓"
 
   if [[ "$CORTEX_INSTANCE_TYPE" == p* ]] || [[ "$CORTEX_INSTANCE_TYPE" == g* ]]; then
@@ -126,7 +126,7 @@ function cluster_up_gcp() {
 
   echo -n "￮ configuring metrics "
   setup_prometheus
-  kubectl apply -f /workspace/grafana/ >/dev/null
+  setup_grafana
   echo "✓"
 
   if [ -n "$CORTEX_ACCELERATOR_TYPE" ]; then
@@ -306,6 +306,12 @@ function setup_prometheus() {
   python render_template.py $CORTEX_CLUSTER_CONFIG_FILE manifests/prometheus-monitoring.yaml.j2 | kubectl apply -f - >/dev/null
 }
 
+function setup_grafana() { # dashboards are applied verbatim; grafana.yaml is piped through envsubst first
+  kubectl apply -f manifests/grafana/grafana-dashboard-realtime.yaml >/dev/null
+  kubectl apply -f manifests/grafana/grafana-dashboard-batch.yaml >/dev/null
+  envsubst < manifests/grafana/grafana.yaml | kubectl apply -f - >/dev/null
+}
+
 function setup_secrets_gcp() {
   kubectl create secret generic 'gcp-credentials' --from-file=key.json=$GOOGLE_APPLICATION_CREDENTIALS >/dev/null
 }

From 2c54d2b5d0b468ebc3257d04f196382958f1bc1d Mon Sep 17 00:00:00 2001
From: Miguel Varela Ramos <miguel@cortexlabs.com>
Date: Mon, 8 Feb 2021 17:03:09 +0100
Subject: [PATCH 11/26] Make grafana service ClusterIP type

---
 manager/manifests/grafana/grafana.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/manager/manifests/grafana/grafana.yaml b/manager/manifests/grafana/grafana.yaml
index 9230dc6167..fbd7d4b2bb 100644
--- a/manager/manifests/grafana/grafana.yaml
+++ b/manager/manifests/grafana/grafana.yaml
@@ -166,13 +166,13 @@ metadata:
   name: grafana
   namespace: default
 spec:
+  type: ClusterIP
   ports:
     - name: http
       port: 3000
       targetPort: http
   selector:
     app: grafana
-  type: NodePort
 
 ---
 

From beb4c663753cb4ae6a900f132de2fc37b29f49fc Mon Sep 17 00:00:00 2001
From: Miguel Varela Ramos <miguel@cortexlabs.com>
Date: Mon, 8 Feb 2021 17:03:31 +0100
Subject: [PATCH 12/26] Use custom grafana image in manifest

---
 manager/manifests/grafana/grafana.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/manager/manifests/grafana/grafana.yaml b/manager/manifests/grafana/grafana.yaml
index fbd7d4b2bb..0fbc509bf5 100644
--- a/manager/manifests/grafana/grafana.yaml
+++ b/manager/manifests/grafana/grafana.yaml
@@ -96,7 +96,7 @@ spec:
         app: grafana
     spec:
       containers:
-        - image: grafana/grafana:7.4.0
+        - image: $CORTEX_IMAGE_GRAFANA
           name: grafana
           ports:
             - containerPort: 3000

From c7018c1e694fa31af84a32986b1879cb2dfa7252 Mon Sep 17 00:00:00 2001
From: Miguel Varela Ramos <miguel@cortexlabs.com>
Date: Mon, 8 Feb 2021 17:44:45 +0100
Subject: [PATCH 13/26] Add grafana dashboard URL to cortex CLI. Remove old
 dashboard code from operator

---
 cli/cmd/cluster.go                            |  40 -----
 pkg/consts/consts.go                          |   1 -
 pkg/operator/operator/k8s.go                  |  13 +-
 pkg/operator/resources/realtimeapi/api.go     |  67 +++-----
 .../resources/realtimeapi/dashboard.go        | 153 ------------------
 5 files changed, 34 insertions(+), 240 deletions(-)
 delete mode 100644 pkg/operator/resources/realtimeapi/dashboard.go

diff --git a/cli/cmd/cluster.go b/cli/cmd/cluster.go
index 92c70083df..f4d8513027 100644
--- a/cli/cmd/cluster.go
+++ b/cli/cmd/cluster.go
@@ -28,7 +28,6 @@ import (
 	"github.com/aws/aws-sdk-go/service/elbv2"
 	"github.com/cortexlabs/cortex/cli/cluster"
 	"github.com/cortexlabs/cortex/cli/types/cliconfig"
-	"github.com/cortexlabs/cortex/pkg/consts"
 	"github.com/cortexlabs/cortex/pkg/lib/archive"
 	"github.com/cortexlabs/cortex/pkg/lib/aws"
 	"github.com/cortexlabs/cortex/pkg/lib/console"
@@ -208,11 +207,6 @@ var _clusterUpCmd = &cobra.Command{
 			exit.Error(err)
 		}
 
-		//err = createOrClearDashboard(awsClient, clusterConfig.ClusterName)
-		//if err != nil {
-		//	exit.Error(err)
-		//}
-
 		out, exitCode, err := runManagerWithClusterConfig("/root/install.sh", clusterConfig, awsCreds, nil, nil)
 		if err != nil {
 			exit.Error(err)
@@ -481,16 +475,6 @@ var _clusterDownCmd = &cobra.Command{
 			prompt.YesOrExit(fmt.Sprintf("your cluster named \"%s\" in %s will be spun down and all apis will be deleted, are you sure you want to continue?", *accessConfig.ClusterName, *accessConfig.Region), "", "")
 		}
 
-		//fmt.Print("￮ deleting dashboard ")
-		//err = awsClient.DeleteDashboard(*accessConfig.ClusterName)
-		//if err != nil {
-		//	fmt.Printf("\n\nunable to delete cortex's api dashboard (see error below); if it still exists after the cluster has been deleted, please delete it via the cloudwatch console: https://%s.console.aws.amazon.com/cloudwatch/home#dashboards:\n", *accessConfig.Region)
-		//	errors.PrintError(err)
-		//	fmt.Println()
-		//} else {
-		//	fmt.Println("✓")
-		//}
-
 		fmt.Print("￮ deleting sqs queues ")
 		err = awsClient.DeleteQueuesWithPrefix(clusterconfig.SQSNamePrefix(*accessConfig.ClusterName))
 		if err != nil {
@@ -1037,30 +1021,6 @@ func createLogGroupIfNotFound(awsClient *aws.Client, logGroup string, tags map[s
 	return nil
 }
 
-// createOrClearDashboard creates a new dashboard (or clears an existing one if it already exists)
-func createOrClearDashboard(awsClient *aws.Client, dashboardName string) error {
-	dashboardFound, err := awsClient.DoesDashboardExist(dashboardName)
-	if err != nil {
-		return err
-	}
-
-	if dashboardFound {
-		fmt.Print("￮ using existing cloudwatch dashboard: ", dashboardName)
-	} else {
-		fmt.Print("￮ creating cloudwatch dashboard: ", dashboardName)
-	}
-
-	err = awsClient.CreateDashboard(dashboardName, consts.DashboardTitle)
-	if err != nil {
-		fmt.Print("\n\n")
-		return err
-	}
-
-	fmt.Println(" ✓")
-
-	return nil
-}
-
 // Will return error if load balancer can't be found
 func getAWSOperatorLoadBalancer(clusterName string, awsClient *aws.Client) (*elbv2.LoadBalancer, error) {
 	loadBalancer, err := awsClient.FindLoadBalancer(map[string]string{
diff --git a/pkg/consts/consts.go b/pkg/consts/consts.go
index 3c2f9dda39..3dc750779e 100644
--- a/pkg/consts/consts.go
+++ b/pkg/consts/consts.go
@@ -50,7 +50,6 @@ var (
 		DefaultImageONNXPredictorGPU,
 	)
 
-	DashboardTitle               = "# cortex monitoring dashboard"
 	DefaultMaxReplicaConcurrency = int64(1024)
 	NeuronCoresPerInf            = int64(4)
 )
diff --git a/pkg/operator/operator/k8s.go b/pkg/operator/operator/k8s.go
index 284b7c883f..25e74afa3f 100644
--- a/pkg/operator/operator/k8s.go
+++ b/pkg/operator/operator/k8s.go
@@ -1100,9 +1100,18 @@ func K8sName(apiName string) string {
 	return "api-" + apiName
 }
 
-// APILoadBalancerURL returns http endpoint of cluster ingress load balancer
+// APILoadBalancerURL returns the http endpoint of the ingress load balancer for deployed APIs
 func APILoadBalancerURL() (string, error) {
-	service, err := config.K8sIstio.GetService("ingressgateway-apis")
+	return getLoadBalancerURL("ingressgateway-apis")
+}
+
+// LoadBalancerURL returns the http endpoint of the ingress load balancer for the operator
+func LoadBalancerURL() (string, error) {
+	return getLoadBalancerURL("ingressgateway-operator")
+}
+
+func getLoadBalancerURL(name string) (string, error) {
+	service, err := config.K8sIstio.GetService(name)
 	if err != nil {
 		return "", err
 	}
diff --git a/pkg/operator/resources/realtimeapi/api.go b/pkg/operator/resources/realtimeapi/api.go
index 4a7078c5d1..759ea5d78d 100644
--- a/pkg/operator/resources/realtimeapi/api.go
+++ b/pkg/operator/resources/realtimeapi/api.go
@@ -24,6 +24,7 @@ import (
 	"github.com/cortexlabs/cortex/pkg/lib/errors"
 	"github.com/cortexlabs/cortex/pkg/lib/k8s"
 	"github.com/cortexlabs/cortex/pkg/lib/parallel"
+	"github.com/cortexlabs/cortex/pkg/lib/pointer"
 	"github.com/cortexlabs/cortex/pkg/operator/config"
 	"github.com/cortexlabs/cortex/pkg/operator/lib/routines"
 	"github.com/cortexlabs/cortex/pkg/operator/operator"
@@ -37,6 +38,8 @@ import (
 	kcore "k8s.io/api/core/v1"
 )
 
+const _realtimeDashboardUID = "xvWFsZPGk"
+
 var _autoscalerCrons = make(map[string]cron.Cron) // apiName -> cron
 
 func deploymentID() string {
@@ -73,13 +76,6 @@ func UpdateAPI(apiConfig *userconfig.API, projectID string, force bool) (*spec.A
 			return nil, "", err
 		}
 
-		//if config.Provider == types.AWSProviderType {
-		//	err = addAPIToDashboard(config.ClusterName(), api.Name)
-		//	if err != nil {
-		//		errors.PrintError(err)
-		//	}
-		//}
-
 		return api, fmt.Sprintf("creating %s", api.Resource.UserString()), nil
 	}
 
@@ -176,25 +172,6 @@ func DeleteAPI(apiName string, keepCache bool) error {
 			deleteBucketResources(apiName)
 			return nil
 		},
-		// delete api from cloudwatch dashboard
-		//func() error {
-		//	if config.Provider == types.AWSProviderType {
-		//		virtualServices, err := config.K8s.ListVirtualServicesByLabel("apiKind", userconfig.RealtimeAPIKind.String())
-		//		if err != nil {
-		//			return errors.Wrap(err, "failed to get virtual services")
-		//		}
-		//		// extract all api names from statuses
-		//		allAPINames := make([]string, len(virtualServices))
-		//		for i, virtualService := range virtualServices {
-		//			allAPINames[i] = virtualService.Labels["apiName"]
-		//		}
-		//		err = removeAPIFromDashboard(allAPINames, config.ClusterName(), apiName)
-		//		if err != nil {
-		//			return errors.Wrap(err, "failed to delete API from dashboard")
-		//		}
-		//	}
-		//	return nil
-		//},
 	)
 
 	if err != nil {
@@ -273,18 +250,15 @@ func GetAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResp
 		return nil, err
 	}
 
-	//var dashboardURL *string
-	//if config.Provider == types.AWSProviderType {
-	//	dashboardURL = pointer.String(DashboardURL())
-	//}
+	dashboardURL := pointer.String(getDashboardURL(api.Name))
 
 	return []schema.APIResponse{
 		{
-			Spec:     *api,
-			Status:   status,
-			Metrics:  metrics,
-			Endpoint: apiEndpoint,
-			//DashboardURL: dashboardURL,
+			Spec:         *api,
+			Status:       status,
+			Metrics:      metrics,
+			Endpoint:     apiEndpoint,
+			DashboardURL: dashboardURL,
 		},
 	}, nil
 }
@@ -428,15 +402,6 @@ func deleteBucketResources(apiName string) error {
 	return config.DeleteBucketDir(prefix, true)
 }
 
-func IsAPIUpdating(apiName string) (bool, error) {
-	deployment, err := config.K8s.GetDeployment(operator.K8sName(apiName))
-	if err != nil {
-		return false, err
-	}
-
-	return isAPIUpdating(deployment)
-}
-
 // returns true if min_replicas are not ready and no updated replicas have errored
 func isAPIUpdating(deployment *kapps.Deployment) (bool, error) {
 	pods, err := config.K8s.ListPodsByLabel("apiName", deployment.Labels["apiName"])
@@ -462,3 +427,17 @@ func isPodSpecLatest(deployment *kapps.Deployment, pod *kcore.Pod) bool {
 	return deployment.Spec.Template.Labels["predictorID"] == pod.Labels["predictorID"] &&
 		deployment.Spec.Template.Labels["deploymentID"] == pod.Labels["deploymentID"]
 }
+
+func getDashboardURL(apiName string) string {
+	loadBalancerURL, err := operator.LoadBalancerURL()
+	if err != nil {
+		return ""
+	}
+
+	dashboardURL := fmt.Sprintf(
+		"%s/dashboard/d/%s/realtimeapi?orgId=1&refresh=30s&var-api_name=%s",
+		loadBalancerURL, _realtimeDashboardUID, apiName,
+	)
+
+	return dashboardURL
+}
diff --git a/pkg/operator/resources/realtimeapi/dashboard.go b/pkg/operator/resources/realtimeapi/dashboard.go
deleted file mode 100644
index 9996e02476..0000000000
--- a/pkg/operator/resources/realtimeapi/dashboard.go
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
-Copyright 2021 Cortex Labs, Inc.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-*/
-
-package realtimeapi
-
-import (
-	"fmt"
-
-	"github.com/cortexlabs/cortex/pkg/consts"
-	"github.com/cortexlabs/cortex/pkg/lib/aws"
-	"github.com/cortexlabs/cortex/pkg/lib/errors"
-	"github.com/cortexlabs/cortex/pkg/operator/config"
-	"github.com/cortexlabs/cortex/pkg/types"
-)
-
-func addAPIToDashboard(dashboardName string, apiName string) error {
-	// get current dashboard from cloudwatch (or a new dashboard if it was deleted)
-	dashboard, err := config.AWS.GetDashboardOrEmpty(dashboardName, consts.DashboardTitle)
-	if err != nil {
-		return err
-	}
-
-	err = addAPIToDashboardObject(dashboard, dashboardName, apiName)
-	if err != nil {
-		return err
-	}
-
-	err = config.AWS.PutDashboard(dashboard, dashboardName)
-	if err != nil {
-		return err
-	}
-
-	return nil
-}
-
-func removeAPIFromDashboard(allAPINames []string, dashboardName string, apiToRemove string) error {
-	// create a new base dashboard
-	dashboard := config.AWS.NewDashboard(consts.DashboardTitle)
-
-	// update dashboard by adding all APIs except the one to delete
-	for _, apiName := range allAPINames {
-		if apiName == apiToRemove {
-			continue
-		}
-		err := addAPIToDashboardObject(dashboard, dashboardName, apiName)
-		if err != nil {
-			return err
-		}
-	}
-
-	err := config.AWS.PutDashboard(dashboard, dashboardName)
-	if err != nil {
-		return err
-	}
-
-	return nil
-}
-
-func addAPIToDashboardObject(dashboard *aws.CloudWatchDashboard, dashboardName string, apiName string) error {
-	// get lowest element on the dashboard (need to place new widgets below all existing widgets)
-	highestY, err := aws.HighestY(dashboard)
-	if err != nil {
-		return errors.Wrap(err, fmt.Sprintf("failed to add API \"%s\" to cloudwatch dashboard", apiName))
-	}
-
-	// create widget for title
-	dashboard.Widgets = append(dashboard.Widgets, aws.TextWidget(1, highestY+1, 22, 1, "## "+apiName))
-
-	grid, err := aws.NewVerticalGrid(1, highestY+2, 6, 11, 3)
-	if err != nil {
-		return nil
-	}
-
-	// first grid column
-	grid.AddWidget(statusCodeMetric(dashboardName, apiName), "responses per minute", "Sum", 60, config.AWS.Region)
-	grid.AddWidget(latencyMetric(dashboardName, apiName), "median response time (ms)", "p50", 60, config.AWS.Region)
-	grid.AddWidget(latencyMetric(dashboardName, apiName), "p99 response time (ms)", "p99", 60, config.AWS.Region)
-
-	// second grid column
-	grid.AddWidget(inFlightMetric(dashboardName, apiName), "total in-flight requests", "Sum", 10, config.AWS.Region)
-	grid.AddWidget(inFlightMetric(dashboardName, apiName), "avg in-flight requests per replica", "Average", 10, config.AWS.Region)
-	// setting the period to 10 seconds because the publishing frequency of the request monitor is 10 seconds
-	grid.AddWidget(inFlightMetric(dashboardName, apiName), "active replicas", "SampleCount", 10, config.AWS.Region)
-
-	// append new API metrics widgets to existing widgets
-	dashboard.Widgets = append(dashboard.Widgets, grid.Widgets...)
-
-	return nil
-}
-
-func inFlightMetric(dashboardName string, apiName string) []interface{} {
-	var metric []interface{}
-	metric = append(metric, dashboardName)
-	metric = append(metric, "in-flight")
-	metric = append(metric, "apiName")
-	metric = append(metric, apiName)
-
-	return []interface{}{metric}
-}
-
-func latencyMetric(dashboardName string, apiName string) []interface{} {
-	var metric []interface{}
-	metric = append(metric, dashboardName)
-	metric = append(metric, "Latency")
-	metric = append(metric, "APIName")
-	metric = append(metric, apiName)
-	metric = append(metric, "metric_type")
-	metric = append(metric, "histogram")
-
-	return []interface{}{metric}
-}
-
-func statusCodeMetric(dashboardName string, apiName string) []interface{} {
-	var metric2XX []interface{}
-	metric2XX = append(metric2XX, dashboardName)
-	metric2XX = append(metric2XX, "StatusCode")
-	metric2XX = append(metric2XX, "APIName")
-	metric2XX = append(metric2XX, apiName)
-	metric2XX = append(metric2XX, "metric_type")
-	metric2XX = append(metric2XX, "counter")
-	metric2XX = append(metric2XX, "Code")
-	metric2XX = append(metric2XX, "2XX")
-
-	var metric4XX []interface{}
-	metric4XX = append(metric4XX, "...")
-	metric4XX = append(metric4XX, "4XX")
-
-	var metric5XX []interface{}
-	metric5XX = append(metric5XX, "...")
-	metric5XX = append(metric5XX, "5XX")
-
-	return []interface{}{metric2XX, metric4XX, metric5XX}
-}
-
-func DashboardURL() string {
-	if config.Provider == types.AWSProviderType {
-		return fmt.Sprintf("https://%s.console.aws.amazon.com/cloudwatch/home#dashboards:name=%s", *config.CoreConfig.Region, config.ClusterName())
-	}
-	return ""
-}

From ea128b6823be28d0d95f019c9864cc3ce4e108c2 Mon Sep 17 00:00:00 2001
From: Miguel Varela Ramos <miguel@cortexlabs.com>
Date: Mon, 8 Feb 2021 19:42:16 +0100
Subject: [PATCH 14/26] Update metrics.md

---
 docs/workloads/realtime/metrics.md | 55 +++++++++++++++---------------
 1 file changed, 27 insertions(+), 28 deletions(-)

diff --git a/docs/workloads/realtime/metrics.md b/docs/workloads/realtime/metrics.md
index b1e3cf6e31..792458cbd7 100644
--- a/docs/workloads/realtime/metrics.md
+++ b/docs/workloads/realtime/metrics.md
@@ -11,42 +11,41 @@ aws   text-generator              live     1            1           8m
 aws   image-classifier-resnet50   live     2            2           1h            32ms          1121126
 ```
 
-The `cortex get API_NAME` command also provides a link to a CloudWatch Metrics dashboard containing this information:
+The `cortex get API_NAME` command also provides a link to a Grafana dashboard:
 
-![dashboard](https://user-images.githubusercontent.com/808475/86186297-8cc5a500-baed-11ea-885f-d5c301b049eb.png)
+![dashboard](https://user-images.githubusercontent.com/7456627/107253455-9c6b7b80-6a36-11eb-8600-f36a7bab6d3b.png)
 
-**responses per minute**
-
-Shows the number of 2XX, 4XX, and 5XX responses per minute.
-
-**median response time**
-
-Shows the median response time for requests, over 1-minute periods (measured in milliseconds).
-
-**p99 response time**
-
-Shows the p99 response time for requests, over 1-minute periods (measured in milliseconds).
-
-**total in-flight requests**
-
-Shows the total number of in-flight requests.
-
-See [metric intervals](#metric-intervals).
+---
 
-**avg in-flight requests per replica**
+## Metrics in the dashboard
+
+| Panel             | Description                                                                        | Note                                                                                               |
+|-------------------|------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------|
+| Request Rate      | Request rate of an API, computed over a minute                                     |                                                                                                    |
+| In Flight Request | Active in-flight requests for an API.                                              | In-flight requests are recorded every 10 seconds, which is the minimum resolution of this panel.   |
+| Active Replicas   | Active replicas for an API                                                         |                                                                                                    |
+| 2XX Responses     | Request rate, computed over a minute, for responses with status code 2XX of an API |                                                                                                    |
+| 4XX Responses     | Request rate, computed over a minute, for responses with status code 4XX of an API |                                                                                                    |
+| 5XX Responses     | Request rate, computed over a minute, for responses with status code 5XX of an API |                                                                                                    |
+| p99 Latency       | 99th percentile latency, computed over a minute, for an API                        | Value might not be accurate because the histogram buckets are not dynamically set.                 |
+| p90 Latency       | 90th percentile latency, computed over a minute, for an API                        | Value might not be accurate because the histogram buckets are not dynamically set.                 |
+| p50 Latency       | 50th percentile latency, computed over a minute, for an API                        | Value might not be accurate because the histogram buckets are not dynamically set.                 |
+| Average Latency   | Average latency, computed over a minute, for an API                                |                                                                                                    |
 
-Shows the average number of in-flight requests per replica.
+---
 
-See [metric intervals](#metric-intervals).
+## Accessing the dashboard
 
-**active replicas**
+The dashboard URL is displayed when you run the `cortex get <api_name>` command.
 
-Shows the number of active replicas.
+### Default credentials
 
-See [metric intervals](#metric-intervals).
+The dashboard is protected with username / password authentication, which by default are:
 
----
+ - Username: admin
+ - Password: admin
 
-#### Metric intervals
+You will be prompted to change the admin user password in the first time you log in.
 
-The referenced widget is aggregated over 10 second intervals because each replica reports its in-flight requests once per 10 seconds. This plot is only available for the last 3 hours (because second-granular data is aggregated to minute-granular data after 3 hours). To plot data older than 3 hours, instead change the period to 1 minute, and divide the y-axis by 6 to (since the metrics are reported every 10 seconds).*
+Grafana allows managing the access of several users and managing teams. 
+For more information on this topic check the [grafana documentation](https://grafana.com/docs/grafana/latest/manage-users/).

From 5f4bfb24cb1d5eac0e12d38a37d9d78394ea206d Mon Sep 17 00:00:00 2001
From: Miguel Varela Ramos <miguel@cortexlabs.com>
Date: Mon, 8 Feb 2021 19:45:47 +0100
Subject: [PATCH 15/26] Update metrics.md

---
 docs/workloads/realtime/metrics.md | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/docs/workloads/realtime/metrics.md b/docs/workloads/realtime/metrics.md
index 792458cbd7..005ad62ba3 100644
--- a/docs/workloads/realtime/metrics.md
+++ b/docs/workloads/realtime/metrics.md
@@ -1,6 +1,7 @@
 # Metrics
 
-The `cortex get` and `cortex get API_NAME` commands display the request time (averaged over the past 2 weeks) and response code counts (summed over the past 2 weeks) for your APIs:
+The `cortex get` and `cortex get API_NAME` commands display the request time (averaged over the past 2 weeks) and
+response code counts (summed over the past 2 weeks) for your APIs:
 
 ```bash
 $ cortex get
@@ -42,10 +43,10 @@ The dashboard is displayed once you run a `cortex get <api_name>` command.
 
 The dashboard is protected with username / password authentication, which by default are:
 
- - Username: admin
- - Password: admin
+- Username: admin
+- Password: admin
 
 You will be prompted to change the admin user password in the first time you log in.
 
-Grafana allows managing the access of several users and managing teams. 
-For more information on this topic check the [grafana documentation](https://grafana.com/docs/grafana/latest/manage-users/).
+Grafana allows managing the access of several users and managing teams. For more information on this topic check
+the [Grafana documentation](https://grafana.com/docs/grafana/latest/manage-users/).

From 151348c8861284e073b0f664e34c9066c51c5bc4 Mon Sep 17 00:00:00 2001
From: Miguel Varela Ramos <miguel@cortexlabs.com>
Date: Tue, 9 Feb 2021 13:00:00 +0100
Subject: [PATCH 16/26] Remove metrics cloud exporters and add grafana to Helm
 chart

---
 charts/dashboards/batch.json                  |  401 ++++++
 charts/dashboards/realtime.json               | 1221 +++++++++++++++++
 charts/templates/grafana.yaml                 |  226 +++
 charts/templates/prometheus-monitoring.yaml   |   19 +-
 .../templates/prometheus-to-cloudwatch.yaml   |   47 -
 charts/values.yaml                            |    9 +-
 6 files changed, 1855 insertions(+), 68 deletions(-)
 create mode 100644 charts/dashboards/batch.json
 create mode 100644 charts/dashboards/realtime.json
 create mode 100644 charts/templates/grafana.yaml
 delete mode 100644 charts/templates/prometheus-to-cloudwatch.yaml

diff --git a/charts/dashboards/batch.json b/charts/dashboards/batch.json
new file mode 100644
index 0000000000..d5d238ab53
--- /dev/null
+++ b/charts/dashboards/batch.json
@@ -0,0 +1,401 @@
+{
+  "annotations": {
+    "list": [
+      {
+        "builtIn": 1,
+        "datasource": "prometheus",
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 1)",
+        "name": "Annotations & Alerts",
+        "type": "dashboard"
+      }
+    ]
+  },
+  "editable": true,
+  "gnetId": null,
+  "graphTooltip": 0,
+  "links": [],
+  "panels": [
+    {
+      "datasource": null,
+      "fieldConfig": {
+        "defaults": {
+          "custom": {}
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 2,
+        "w": 24,
+        "x": 0,
+        "y": 0
+      },
+      "id": 7,
+      "options": {
+        "content": "<h1 style=\"text-align: center\">BatchAPI</h1>\n",
+        "mode": "markdown"
+      },
+      "pluginVersion": "7.4.0",
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "",
+      "transparent": true,
+      "type": "text"
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": null,
+      "description": "Number of succeeded batches for an API",
+      "fieldConfig": {
+        "defaults": {
+          "custom": {}
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 9,
+        "w": 12,
+        "x": 0,
+        "y": 2
+      },
+      "hiddenSeries": false,
+      "id": 2,
+      "legend": {
+        "alignAsTable": true,
+        "avg": false,
+        "current": true,
+        "max": true,
+        "min": true,
+        "show": true,
+        "total": false,
+        "values": true
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "7.4.0",
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "sum(cortex_batch_succeeded{api_name=~\"$api_name\"}) by (api_name)",
+          "interval": "",
+          "legendFormat": "{{api_name}}",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "# Succeeded Batches",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "$$hashKey": "object:26",
+          "decimals": 0,
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "$$hashKey": "object:27",
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": null,
+      "description": "Number of failed batches for an API",
+      "fieldConfig": {
+        "defaults": {
+          "custom": {}
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 9,
+        "w": 12,
+        "x": 12,
+        "y": 2
+      },
+      "hiddenSeries": false,
+      "id": 3,
+      "legend": {
+        "alignAsTable": true,
+        "avg": false,
+        "current": true,
+        "max": true,
+        "min": true,
+        "show": true,
+        "total": false,
+        "values": true
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "7.4.0",
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "sum(cortex_batch_failed{api_name=~\"$api_name\"}) by (api_name)",
+          "interval": "",
+          "legendFormat": "{{api_name}}",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "# Failed Batches",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "decimals": 0,
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": null,
+      "description": "Average time per batch for an API",
+      "fieldConfig": {
+        "defaults": {
+          "custom": {}
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 8,
+        "w": 12,
+        "x": 6,
+        "y": 11
+      },
+      "hiddenSeries": false,
+      "id": 5,
+      "legend": {
+        "alignAsTable": true,
+        "avg": true,
+        "current": true,
+        "max": true,
+        "min": true,
+        "show": true,
+        "total": false,
+        "values": true
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "7.4.0",
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "sum(cortex_time_per_batch_sum{api_name=~\"$api_name\"}) by (api_name) / sum(cortex_time_per_batch_count{api_name=~\"$api_name\"}) by (api_name)",
+          "interval": "",
+          "legendFormat": "{{api_name}}",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "Average Time per Batch",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "ms",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    }
+  ],
+  "refresh": "30s",
+  "schemaVersion": 27,
+  "style": "dark",
+  "tags": [],
+  "templating": {
+    "list": [
+      {
+        "allValue": null,
+        "current": {
+          "selected": true,
+          "tags": [],
+          "text": [
+            "image-classifier"
+          ],
+          "value": [
+            "image-classifier"
+          ]
+        },
+        "datasource": null,
+        "definition": "label_values({__name__=~\"cortex_batch_.+\"}, api_name)",
+        "description": null,
+        "error": null,
+        "hide": 0,
+        "includeAll": false,
+        "label": "API Name",
+        "multi": true,
+        "name": "api_name",
+        "options": [
+          {
+            "selected": true,
+            "text": "image-classifier",
+            "value": "image-classifier"
+          }
+        ],
+        "query": {
+          "query": "label_values({__name__=~\"cortex_batch_.+\"}, api_name)",
+          "refId": "StandardVariableQuery"
+        },
+        "refresh": 0,
+        "regex": "",
+        "skipUrlSync": false,
+        "sort": 0,
+        "tagValuesQuery": "",
+        "tags": [],
+        "tagsQuery": "",
+        "type": "query",
+        "useTags": false
+      }
+    ]
+  },
+  "time": {
+    "from": "now-1h",
+    "to": "now"
+  },
+  "timepicker": {},
+  "timezone": "",
+  "title": "BatchAPI",
+  "uid": "nEiYFWEMk",
+  "version": 1
+}
diff --git a/charts/dashboards/realtime.json b/charts/dashboards/realtime.json
new file mode 100644
index 0000000000..61c851cbe5
--- /dev/null
+++ b/charts/dashboards/realtime.json
@@ -0,0 +1,1221 @@
+{
+  "annotations": {
+    "list": [
+      {
+        "builtIn": 1,
+        "datasource": "prometheus",
+        "enable": true,
+        "hide": true,
+        "iconColor": "rgba(0, 211, 255, 1)",
+        "name": "Annotations & Alerts",
+        "type": "dashboard"
+      }
+    ]
+  },
+  "editable": true,
+  "gnetId": null,
+  "graphTooltip": 0,
+  "id": 10,
+  "iteration": 1612793050833,
+  "links": [],
+  "panels": [
+    {
+      "datasource": null,
+      "fieldConfig": {
+        "defaults": {
+          "custom": {}
+        },
+        "overrides": []
+      },
+      "gridPos": {
+        "h": 2,
+        "w": 24,
+        "x": 0,
+        "y": 0
+      },
+      "id": 15,
+      "options": {
+        "content": "<h1 style=\"text-align: center\">RealtimeAPI</h1>",
+        "mode": "markdown"
+      },
+      "pluginVersion": "7.4.0",
+      "timeFrom": null,
+      "timeShift": null,
+      "title": "",
+      "transparent": true,
+      "type": "text"
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": null,
+      "description": "Request rate of an API, computed over a 1-minute window",
+      "fieldConfig": {
+        "defaults": {
+          "custom": {}
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 9,
+        "w": 12,
+        "x": 0,
+        "y": 2
+      },
+      "hiddenSeries": false,
+      "id": 2,
+      "legend": {
+        "alignAsTable": true,
+        "avg": true,
+        "current": true,
+        "max": true,
+        "min": true,
+        "rightSide": false,
+        "show": true,
+        "total": false,
+        "values": true
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "7.4.0",
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "sum (rate(istio_requests_total{destination_service_name=~\"api-$api_name.+\"}[1m])) by (destination_service_name)",
+          "interval": "",
+          "legendFormat": "{{destination_service_name}}",
+          "refId": "2XX"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "Request Rate",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "transformations": [
+        {
+          "id": "renameByRegex",
+          "options": {
+            "regex": "([^\\.]+)\\..+",
+            "renamePattern": "$1"
+          }
+        },
+        {
+          "id": "renameByRegex",
+          "options": {
+            "regex": "api-(.*)",
+            "renamePattern": "$1"
+          }
+        }
+      ],
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "reqps",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": null,
+      "description": "Active in-flight requests for an API.\n\nNote: In-flight requests are recorded every 10 seconds, which will correspond to the minimum resolution.",
+      "fieldConfig": {
+        "defaults": {
+          "custom": {}
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 9,
+        "w": 12,
+        "x": 12,
+        "y": 2
+      },
+      "hiddenSeries": false,
+      "id": 4,
+      "legend": {
+        "alignAsTable": true,
+        "avg": true,
+        "current": true,
+        "max": true,
+        "min": true,
+        "show": true,
+        "total": false,
+        "values": true
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "7.4.0",
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "sum(cortex_in_flight_requests{api_name=~\"$api_name\"}) by (api_name)",
+          "interval": "",
+          "legendFormat": "{{api_name}}",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "In-Flight Requests",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "decimals": 0,
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": null,
+      "description": "Request rate, computed over every minute, for responses with status code 2XX of an API",
+      "fieldConfig": {
+        "defaults": {
+          "custom": {}
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 9,
+        "w": 12,
+        "x": 0,
+        "y": 11
+      },
+      "hiddenSeries": false,
+      "id": 8,
+      "legend": {
+        "alignAsTable": true,
+        "avg": true,
+        "current": true,
+        "max": true,
+        "min": true,
+        "rightSide": false,
+        "show": true,
+        "total": false,
+        "values": true
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "7.4.0",
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "sum(rate(istio_requests_total{destination_service_name=~\"api-$api_name.+\", response_code=~\"2.*\"}[1m])) by (destination_service_name, response_code)",
+          "interval": "",
+          "legendFormat": "{{destination_service_name}}",
+          "refId": "2XX"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "2XX Responses",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "transformations": [
+        {
+          "id": "renameByRegex",
+          "options": {
+            "regex": "([^\\.]+)\\..+",
+            "renamePattern": "$1"
+          }
+        },
+        {
+          "id": "renameByRegex",
+          "options": {
+            "regex": "api-(.*)",
+            "renamePattern": "$1"
+          }
+        }
+      ],
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "reqps",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": null,
+      "fieldConfig": {
+        "defaults": {
+          "custom": {}
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 9,
+        "w": 12,
+        "x": 12,
+        "y": 11
+      },
+      "hiddenSeries": false,
+      "id": 7,
+      "legend": {
+        "alignAsTable": true,
+        "avg": true,
+        "current": true,
+        "max": true,
+        "min": true,
+        "show": true,
+        "total": false,
+        "values": true
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "7.4.0",
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "count(cortex_in_flight_requests{api_name=~\"$api_name\"}) by (api_name)",
+          "interval": "",
+          "legendFormat": "{{api_name}}",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "Active Replicas",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "decimals": 0,
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": null,
+      "description": "Request rate, computed over every minute, for responses with status code 4XX of an API",
+      "fieldConfig": {
+        "defaults": {
+          "custom": {}
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 9,
+        "w": 12,
+        "x": 0,
+        "y": 20
+      },
+      "hiddenSeries": false,
+      "id": 9,
+      "legend": {
+        "alignAsTable": true,
+        "avg": true,
+        "current": true,
+        "max": true,
+        "min": true,
+        "rightSide": false,
+        "show": true,
+        "total": false,
+        "values": true
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "7.4.0",
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "sum(rate(istio_requests_total{destination_service_name=~\"api-$api_name.+\", response_code=~\"4.+\"}[1m])) by (destination_service_name, response_code)",
+          "interval": "",
+          "legendFormat": "{{destination_service_name}}",
+          "refId": "2XX"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "4XX Responses",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "transformations": [
+        {
+          "id": "renameByRegex",
+          "options": {
+            "regex": "([^\\.]+)\\..+",
+            "renamePattern": "$1"
+          }
+        },
+        {
+          "id": "renameByRegex",
+          "options": {
+            "regex": "api-(.*)",
+            "renamePattern": "$1"
+          }
+        }
+      ],
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "reqps",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": null,
+      "description": "Request rate, computed over a 5-minute window, for responses with status code 5XX of an API",
+      "fieldConfig": {
+        "defaults": {
+          "custom": {}
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 9,
+        "w": 12,
+        "x": 12,
+        "y": 20
+      },
+      "hiddenSeries": false,
+      "id": 10,
+      "legend": {
+        "alignAsTable": true,
+        "avg": true,
+        "current": true,
+        "max": true,
+        "min": true,
+        "rightSide": false,
+        "show": true,
+        "total": false,
+        "values": true
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "7.4.0",
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "sum(rate(istio_requests_total{destination_service_name=~\"api-$api_name.+\", response_code=~\"5.+\"}[5m])) by (destination_service_name, response_code)",
+          "interval": "",
+          "legendFormat": "{{destination_service_name}}",
+          "refId": "2XX"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "5XX Responses",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "transformations": [
+        {
+          "id": "renameByRegex",
+          "options": {
+            "regex": "([^\\.]+)\\..+",
+            "renamePattern": "$1"
+          }
+        },
+        {
+          "id": "renameByRegex",
+          "options": {
+            "regex": "api-(.*)",
+            "renamePattern": "$1"
+          }
+        }
+      ],
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "reqps",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": null,
+      "description": "99th percentile latency, computed over a minute, for an API",
+      "fieldConfig": {
+        "defaults": {
+          "custom": {}
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 9,
+        "w": 12,
+        "x": 0,
+        "y": 29
+      },
+      "hiddenSeries": false,
+      "id": 6,
+      "legend": {
+        "alignAsTable": true,
+        "avg": true,
+        "current": true,
+        "max": true,
+        "min": true,
+        "show": true,
+        "total": false,
+        "values": true
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "7.4.0",
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.99, sum without (response_code) (rate(istio_request_duration_milliseconds_bucket{destination_service_name=~\"api-$api_name.+\"}[1m])))",
+          "interval": "",
+          "legendFormat": "{{destination_service_name}}",
+          "refId": "A"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "p99 Latency",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "transformations": [
+        {
+          "id": "renameByRegex",
+          "options": {
+            "regex": "([^\\.]+)\\..+",
+            "renamePattern": "$1"
+          }
+        },
+        {
+          "id": "renameByRegex",
+          "options": {
+            "regex": "api-(.*)",
+            "renamePattern": "$1"
+          }
+        }
+      ],
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "ms",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": null,
+      "description": "90th percentile latency, computed over a minute, for an API",
+      "fieldConfig": {
+        "defaults": {
+          "custom": {}
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 9,
+        "w": 12,
+        "x": 12,
+        "y": 29
+      },
+      "hiddenSeries": false,
+      "id": 11,
+      "legend": {
+        "alignAsTable": true,
+        "avg": true,
+        "current": true,
+        "max": true,
+        "min": true,
+        "show": true,
+        "total": false,
+        "values": true
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "7.4.0",
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.90, sum without (response_code) (rate(istio_request_duration_milliseconds_bucket{destination_service_name=~\"api-$api_name.+\"}[1m])))",
+          "hide": false,
+          "interval": "",
+          "legendFormat": "{{destination_service_name}}",
+          "refId": "B"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "p90 Latency",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "transformations": [
+        {
+          "id": "renameByRegex",
+          "options": {
+            "regex": "([^\\.]+)\\..+",
+            "renamePattern": "$1"
+          }
+        },
+        {
+          "id": "renameByRegex",
+          "options": {
+            "regex": "api-(.*)",
+            "renamePattern": "$1"
+          }
+        }
+      ],
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "ms",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": null,
+      "description": "50th percentile latency, computed over a minute, for an API",
+      "fieldConfig": {
+        "defaults": {
+          "custom": {}
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 9,
+        "w": 12,
+        "x": 0,
+        "y": 38
+      },
+      "hiddenSeries": false,
+      "id": 16,
+      "legend": {
+        "alignAsTable": true,
+        "avg": true,
+        "current": true,
+        "max": true,
+        "min": true,
+        "show": true,
+        "total": false,
+        "values": true
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "7.4.0",
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "histogram_quantile(0.50, sum without (response_code) (rate(istio_request_duration_milliseconds_bucket{destination_service_name=~\"api-$api_name.+\"}[1m])))",
+          "hide": false,
+          "interval": "",
+          "legendFormat": "{{destination_service_name}}",
+          "refId": "B"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "p50 Latency",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "transformations": [
+        {
+          "id": "renameByRegex",
+          "options": {
+            "regex": "([^\\.]+)\\..+",
+            "renamePattern": "$1"
+          }
+        },
+        {
+          "id": "renameByRegex",
+          "options": {
+            "regex": "api-(.*)",
+            "renamePattern": "$1"
+          }
+        }
+      ],
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "ms",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    },
+    {
+      "aliasColors": {},
+      "bars": false,
+      "dashLength": 10,
+      "dashes": false,
+      "datasource": null,
+      "description": "Average latency, computed over a minute, for an API",
+      "fieldConfig": {
+        "defaults": {
+          "custom": {}
+        },
+        "overrides": []
+      },
+      "fill": 1,
+      "fillGradient": 0,
+      "gridPos": {
+        "h": 9,
+        "w": 12,
+        "x": 12,
+        "y": 38
+      },
+      "hiddenSeries": false,
+      "id": 12,
+      "legend": {
+        "alignAsTable": true,
+        "avg": true,
+        "current": true,
+        "max": true,
+        "min": true,
+        "show": true,
+        "total": false,
+        "values": true
+      },
+      "lines": true,
+      "linewidth": 1,
+      "nullPointMode": "null",
+      "options": {
+        "alertThreshold": true
+      },
+      "percentage": false,
+      "pluginVersion": "7.4.0",
+      "pointradius": 2,
+      "points": false,
+      "renderer": "flot",
+      "seriesOverrides": [],
+      "spaceLength": 10,
+      "stack": false,
+      "steppedLine": false,
+      "targets": [
+        {
+          "expr": "sum(rate(istio_request_duration_milliseconds_sum{destination_service_name=~\"api-$api_name.+\"}[1m])) by (destination_service_name) / sum(rate(istio_request_duration_milliseconds_count{destination_service_name=~\"api-$api_name.+\"}[1m])) by (destination_service_name)",
+          "hide": false,
+          "interval": "",
+          "legendFormat": "{{destination_service_name}}",
+          "refId": "D"
+        }
+      ],
+      "thresholds": [],
+      "timeFrom": null,
+      "timeRegions": [],
+      "timeShift": null,
+      "title": "Average Latency",
+      "tooltip": {
+        "shared": true,
+        "sort": 0,
+        "value_type": "individual"
+      },
+      "transformations": [
+        {
+          "id": "renameByRegex",
+          "options": {
+            "regex": "([^\\.]+)\\..+",
+            "renamePattern": "$1"
+          }
+        },
+        {
+          "id": "renameByRegex",
+          "options": {
+            "regex": "api-(.*)",
+            "renamePattern": "$1"
+          }
+        }
+      ],
+      "type": "graph",
+      "xaxis": {
+        "buckets": null,
+        "mode": "time",
+        "name": null,
+        "show": true,
+        "values": []
+      },
+      "yaxes": [
+        {
+          "format": "ms",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": "0",
+          "show": true
+        },
+        {
+          "format": "short",
+          "label": null,
+          "logBase": 1,
+          "max": null,
+          "min": null,
+          "show": true
+        }
+      ],
+      "yaxis": {
+        "align": false,
+        "alignLevel": null
+      }
+    }
+  ],
+  "refresh": "30s",
+  "schemaVersion": 27,
+  "style": "dark",
+  "tags": [],
+  "templating": {
+    "list": [
+      {
+        "allValue": null,
+        "current": {
+          "selected": true,
+          "tags": [],
+          "text": [
+            "autoscaling",
+            "iris-classifier"
+          ],
+          "value": [
+            "autoscaling",
+            "iris-classifier"
+          ]
+        },
+        "datasource": null,
+        "definition": "label_values(cortex_in_flight_requests{api_kind=\"RealtimeAPI\"}, api_name)",
+        "description": null,
+        "error": null,
+        "hide": 0,
+        "includeAll": false,
+        "label": "API Name",
+        "multi": true,
+        "name": "api_name",
+        "options": [
+          {
+            "selected": true,
+            "text": "autoscaling",
+            "value": "autoscaling"
+          },
+          {
+            "selected": true,
+            "text": "iris-classifier",
+            "value": "iris-classifier"
+          }
+        ],
+        "query": {
+          "query": "label_values(cortex_in_flight_requests{api_kind=\"RealtimeAPI\"}, api_name)",
+          "refId": "StandardVariableQuery"
+        },
+        "refresh": 0,
+        "regex": "",
+        "skipUrlSync": false,
+        "sort": 0,
+        "tagValuesQuery": "",
+        "tags": [],
+        "tagsQuery": "",
+        "type": "query",
+        "useTags": false
+      }
+    ]
+  },
+  "time": {
+    "from": "now-1h",
+    "to": "now"
+  },
+  "timepicker": {},
+  "timezone": "",
+  "title": "RealtimeAPI",
+  "uid": "xvWFsZPGk",
+  "version": 5
+}
diff --git a/charts/templates/grafana.yaml b/charts/templates/grafana.yaml
new file mode 100644
index 0000000000..0b221d8fdb
--- /dev/null
+++ b/charts/templates/grafana.yaml
@@ -0,0 +1,226 @@
+# Copyright 2021 Cortex Labs, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+{{- if .Values.addons.grafana.enabled }}
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-datasources
+  namespace: {{ .Release.Namespace }}
+data:
+  datasources.yaml: |
+    {
+        "apiVersion": 1,
+        "datasources": [
+            {
+                "access": "proxy",
+                "editable": false,
+                "name": "prometheus",
+                "orgId": 1,
+                "type": "prometheus",
+                "url": "http://prometheus.{{.Release.Namespace}}:9090",
+                "version": 1,
+                "isDefault": true
+            }
+        ]
+    }
+
+---
+
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboards
+  namespace: {{ .Release.Namespace }}
+data:
+  dashboards.yaml: |-
+    {
+        "apiVersion": 1,
+        "providers": [
+            {
+                "folder": "Cortex",
+                "name": "Cortex",
+                "options": {
+                    "path": "/grafana-dashboard-definitions/cortex"
+                },
+                "disableDeletion": true,
+                "orgId": 1,
+                "type": "file"
+            }
+        ]
+    }
+
+---
+
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboard-realtime
+  namespace: {{ .Release.Namespace }}
+data:
+  realtime.json: |-
+{{ .Files.Get "dashboards/realtime.json" | indent 4 }}
+
+---
+
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-dashboard-batch
+  namespace: {{ .Release.Namespace }}
+data:
+  batch.json: |-
+{{ .Files.Get "dashboards/batch.json" | indent 4 }}
+
+---
+
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: grafana-storage
+  namespace: {{ .Release.Namespace }}
+spec:
+  storageClassName: ssd
+  accessModes:
+    - ReadWriteOnce
+  resources:
+    requests:
+      storage: 2Gi
+
+---
+
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  labels:
+    app: grafana
+  name: grafana
+  namespace: {{ .Release.Namespace }}
+spec:
+  serviceName: grafana
+  replicas: 1
+  selector:
+    matchLabels:
+      app: grafana
+  template:
+    metadata:
+      labels:
+        app: grafana
+    spec:
+      containers:
+        - image: {{ .Values.cortex.image_grafana }}
+          name: grafana
+          ports:
+            - containerPort: 3000
+              name: http
+          readinessProbe:
+            httpGet:
+              path: /api/health
+              port: http
+          resources:
+            limits:
+              cpu: 200m
+              memory: 200Mi
+            requests:
+              cpu: 100m
+              memory: 100Mi
+          env:
+            - name: GF_SERVER_ROOT_URL
+              value: "%(protocol)s://%(domain)s:%(http_port)s/dashboard"
+            - name: GF_SERVER_SERVE_FROM_SUB_PATH
+              value: "true"
+            - name: GF_USERS_DEFAULT_THEME
+              value: "light"
+          volumeMounts:
+            - mountPath: /var/lib/grafana
+              name: grafana-storage
+              readOnly: false
+            - mountPath: /etc/grafana/provisioning/datasources
+              name: grafana-datasources
+              readOnly: false
+            - mountPath: /etc/grafana/provisioning/dashboards
+              name: grafana-dashboards
+              readOnly: false
+            - mountPath: /grafana-dashboard-definitions/cortex/realtime
+              name: grafana-dashboard-realtime
+              readOnly: false
+            - mountPath: /grafana-dashboard-definitions/cortex/batch
+              name: grafana-dashboard-batch
+              readOnly: false
+      securityContext:
+        fsGroup: 65534
+        runAsNonRoot: true
+        runAsUser: 65534
+      volumes:
+        - name: grafana-storage
+          persistentVolumeClaim:
+            claimName: grafana-storage
+        - name: grafana-datasources
+          configMap:
+            name: grafana-datasources
+        - name: grafana-dashboards
+          configMap:
+            name: grafana-dashboards
+        - name: grafana-dashboard-realtime
+          configMap:
+            name: grafana-dashboard-realtime
+        - name: grafana-dashboard-batch
+          configMap:
+            name: grafana-dashboard-batch
+
+---
+
+apiVersion: v1
+kind: Service
+metadata:
+  labels:
+    app: grafana
+  name: grafana
+  namespace: {{ .Release.Namespace }}
+spec:
+  type: ClusterIP
+  ports:
+    - name: http
+      port: 3000
+      targetPort: http
+  selector:
+    app: grafana
+
+---
+
+apiVersion: networking.istio.io/v1beta1
+kind: VirtualService
+metadata:
+  name: grafana
+  namespace: {{ .Release.Namespace }}
+spec:
+  hosts:
+    - "*"
+  gateways:
+    - operator-gateway
+  http:
+    - name: grafana
+      match:
+        - uri:
+            prefix: "/dashboard"
+        - uri:
+            prefix: "/grafana"
+      rewrite:
+        uri: "/dashboard"
+      route:
+        - destination:
+            host: grafana
+            port:
+              number: 3000
+{{- end }}
diff --git a/charts/templates/prometheus-monitoring.yaml b/charts/templates/prometheus-monitoring.yaml
index 586999bca1..88fbd26775 100644
--- a/charts/templates/prometheus-monitoring.yaml
+++ b/charts/templates/prometheus-monitoring.yaml
@@ -45,24 +45,7 @@ spec:
     fsGroup: 2000
     runAsNonRoot: true
     runAsUser: 1000
-{{- if eq .Values.global.provider "gcp" }}
-  containers:
-    - name: stackdriver-sidecar
-      image: {{ .Values.cortex.image_prometheus_stackdriver_sidecar }}
-      imagePullPolicy: Always
-      args:
-      - --stackdriver.project-id={{ .Values.cortex.project }}
-      - --prometheus.wal-directory=/data/prometheus-db/wal
-      - --stackdriver.kubernetes.location={{ .Values.cortex.zone  }}
-      - --stackdriver.kubernetes.cluster-name={{ .Values.cortex.cluster_name  }}
-      - --include={job=~"{{ .Release.Namespace }}/.*",__name__=~"cortex.*"}
-      ports:
-      - name: sidecar
-        containerPort: 9091
-      volumeMounts:
-      - mountPath: /data
-        name: prometheus-prometheus-db
-{{- end }}
+
 ---
 
 apiVersion: v1
diff --git a/charts/templates/prometheus-to-cloudwatch.yaml b/charts/templates/prometheus-to-cloudwatch.yaml
deleted file mode 100644
index 240bf77b85..0000000000
--- a/charts/templates/prometheus-to-cloudwatch.yaml
+++ /dev/null
@@ -1,47 +0,0 @@
-{{- if eq .Values.global.provider "aws" }}
-apiVersion: v1
-kind: ConfigMap
-metadata:
-  name: prometheus-cwconfig
-  namespace: {{ .Release.Namespace }}
-data:
-  CLOUDWATCH_NAMESPACE: "cortex"
-  CLOUDWATCH_REGION: "{{ .Values.cortex.region }}"
-  CLOUDWATCH_PUBLISH_TIMEOUT: "10"
-  PROMETHEUS_SCRAPE_INTERVAL: "15"
-  PROMETHEUS_SCRAPE_URL: http://prometheus.{{ .Release.Namespace }}:9090/federate?match[]={job=~"{{ .Values.cortex.namespace }}/.*",__name__=~"cortex.*"}
-  INCLUDE_METRICS: cortex_*
-  EXCLUDE_DIMENSIONS_FOR_METRICS: cortex_*=container,endpoint,instance,job,namespace,pod,prometheus,prometheus_replica
-
----
-
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: prometheus-to-cloudwatch
-  namespace: {{ .Release.Namespace }}
-spec:
-  replicas: 1
-  selector:
-    matchLabels:
-      app: prometheus-to-cloudwatch
-  template:
-    metadata:
-      labels:
-        app: prometheus-to-cloudwatch
-    spec:
-      containers:
-        - name: prometheus-to-cloudwatch
-          image: {{ .Values.cortex.image_prometheus_to_cloudwatch }}
-          imagePullPolicy: Always
-          envFrom:
-            - configMapRef:
-                name: prometheus-cwconfig
-            - secretRef:
-                name: aws-credentials
-          resources:
-            requests:
-              cpu: 100m
-              memory: 150Mi
----
-{{- end }}
diff --git a/charts/values.yaml b/charts/values.yaml
index 162af78643..44838cf46d 100644
--- a/charts/values.yaml
+++ b/charts/values.yaml
@@ -25,11 +25,10 @@ cortex:
   image_istio_pilot: quay.io/cortexlabs/istio-pilot:master
   image_google_pause: quay.io/cortexlabs/pause:master
   image_prometheus: quay.io/cortexlabs/prometheus:master
-  image_prometheus_statsd_exporter: quay.io/cortexlabs/prometheus-statsd-exporter:master
-  image_prometheus_to_cloudwatch: quay.io/cortexlabs/prometheus-to-cloudwatch:master
-  image_prometheus_stackdriver_sidecar: quay.io/cortexlabs/prometheus-stackdriver-sidecar:master
   image_prometheus_config_reloader: quay.io/cortexlabs/prometheus-config-reloader:master
   image_prometheus_operator: quay.io/cortexlabs/prometheus-operator:master
+  image_prometheus_statsd_exporter: quay.io/cortexlabs/prometheus-statsd-exporter:master
+  image_grafana: quay.io/cortexlabs/grafana:master
 
 networking:
   istio-discovery:
@@ -53,3 +52,7 @@ global:
 
   proxy:
     image: istio-proxy
+
+addons:
+  grafana:
+    enabled: true

From 46d198826e05c5ab5c128728d241cae945a2fe0e Mon Sep 17 00:00:00 2001
From: Miguel Varela Ramos <miguel@cortexlabs.com>
Date: Tue, 9 Feb 2021 14:30:55 +0100
Subject: [PATCH 17/26] Fix grafana refresh on templates

---
 charts/dashboards/batch.json                  | 21 ++-----------
 charts/dashboards/realtime.json               | 30 +++----------------
 .../grafana/grafana-dashboard-batch.yaml      | 21 ++-----------
 .../grafana/grafana-dashboard-realtime.yaml   | 30 +++----------------
 4 files changed, 14 insertions(+), 88 deletions(-)

diff --git a/charts/dashboards/batch.json b/charts/dashboards/batch.json
index d5d238ab53..946eed6908 100644
--- a/charts/dashboards/batch.json
+++ b/charts/dashboards/batch.json
@@ -347,16 +347,7 @@
     "list": [
       {
         "allValue": null,
-        "current": {
-          "selected": true,
-          "tags": [],
-          "text": [
-            "image-classifier"
-          ],
-          "value": [
-            "image-classifier"
-          ]
-        },
+        "current": {},
         "datasource": null,
         "definition": "label_values({__name__=~\"cortex_batch_.+\"}, api_name)",
         "description": null,
@@ -366,18 +357,12 @@
         "label": "API Name",
         "multi": true,
         "name": "api_name",
-        "options": [
-          {
-            "selected": true,
-            "text": "image-classifier",
-            "value": "image-classifier"
-          }
-        ],
+        "options": [],
         "query": {
           "query": "label_values({__name__=~\"cortex_batch_.+\"}, api_name)",
           "refId": "StandardVariableQuery"
         },
-        "refresh": 0,
+        "refresh": 1,
         "regex": "",
         "skipUrlSync": false,
         "sort": 0,
diff --git a/charts/dashboards/realtime.json b/charts/dashboards/realtime.json
index 61c851cbe5..c5b077bd7e 100644
--- a/charts/dashboards/realtime.json
+++ b/charts/dashboards/realtime.json
@@ -1160,18 +1160,7 @@
     "list": [
       {
         "allValue": null,
-        "current": {
-          "selected": true,
-          "tags": [],
-          "text": [
-            "autoscaling",
-            "iris-classifier"
-          ],
-          "value": [
-            "autoscaling",
-            "iris-classifier"
-          ]
-        },
+        "current": {},
         "datasource": null,
         "definition": "label_values(cortex_in_flight_requests{api_kind=\"RealtimeAPI\"}, api_name)",
         "description": null,
@@ -1181,23 +1170,12 @@
         "label": "API Name",
         "multi": true,
         "name": "api_name",
-        "options": [
-          {
-            "selected": true,
-            "text": "autoscaling",
-            "value": "autoscaling"
-          },
-          {
-            "selected": true,
-            "text": "iris-classifier",
-            "value": "iris-classifier"
-          }
-        ],
+        "options": [],
         "query": {
           "query": "label_values(cortex_in_flight_requests{api_kind=\"RealtimeAPI\"}, api_name)",
           "refId": "StandardVariableQuery"
         },
-        "refresh": 0,
+        "refresh": 1,
         "regex": "",
         "skipUrlSync": false,
         "sort": 0,
@@ -1217,5 +1195,5 @@
   "timezone": "",
   "title": "RealtimeAPI",
   "uid": "xvWFsZPGk",
-  "version": 5
+  "version": 1
 }
diff --git a/manager/manifests/grafana/grafana-dashboard-batch.yaml b/manager/manifests/grafana/grafana-dashboard-batch.yaml
index 3293750bd7..2649a406c0 100644
--- a/manager/manifests/grafana/grafana-dashboard-batch.yaml
+++ b/manager/manifests/grafana/grafana-dashboard-batch.yaml
@@ -368,16 +368,7 @@ data:
         "list": [
           {
             "allValue": null,
-            "current": {
-              "selected": true,
-              "tags": [],
-              "text": [
-                "image-classifier"
-              ],
-              "value": [
-                "image-classifier"
-              ]
-            },
+            "current": {},
             "datasource": null,
             "definition": "label_values({__name__=~\"cortex_batch_.+\"}, api_name)",
             "description": null,
@@ -387,18 +378,12 @@ data:
             "label": "API Name",
             "multi": true,
             "name": "api_name",
-            "options": [
-              {
-                "selected": true,
-                "text": "image-classifier",
-                "value": "image-classifier"
-              }
-            ],
+            "options": [],
             "query": {
               "query": "label_values({__name__=~\"cortex_batch_.+\"}, api_name)",
               "refId": "StandardVariableQuery"
             },
-            "refresh": 0,
+            "refresh": 1,
             "regex": "",
             "skipUrlSync": false,
             "sort": 0,
diff --git a/manager/manifests/grafana/grafana-dashboard-realtime.yaml b/manager/manifests/grafana/grafana-dashboard-realtime.yaml
index 04cd941180..52c7f98444 100644
--- a/manager/manifests/grafana/grafana-dashboard-realtime.yaml
+++ b/manager/manifests/grafana/grafana-dashboard-realtime.yaml
@@ -1181,18 +1181,7 @@ data:
         "list": [
           {
             "allValue": null,
-            "current": {
-              "selected": true,
-              "tags": [],
-              "text": [
-                "autoscaling",
-                "iris-classifier"
-              ],
-              "value": [
-                "autoscaling",
-                "iris-classifier"
-              ]
-            },
+            "current": {},
             "datasource": null,
             "definition": "label_values(cortex_in_flight_requests{api_kind=\"RealtimeAPI\"}, api_name)",
             "description": null,
@@ -1202,23 +1191,12 @@ data:
             "label": "API Name",
             "multi": true,
             "name": "api_name",
-            "options": [
-              {
-                "selected": true,
-                "text": "autoscaling",
-                "value": "autoscaling"
-              },
-              {
-                "selected": true,
-                "text": "iris-classifier",
-                "value": "iris-classifier"
-              }
-            ],
+            "options": [],
             "query": {
               "query": "label_values(cortex_in_flight_requests{api_kind=\"RealtimeAPI\"}, api_name)",
               "refId": "StandardVariableQuery"
             },
-            "refresh": 0,
+            "refresh": 1,
             "regex": "",
             "skipUrlSync": false,
             "sort": 0,
@@ -1238,5 +1216,5 @@ data:
       "timezone": "",
       "title": "RealtimeAPI",
       "uid": "xvWFsZPGk",
-      "version": 5
+      "version": 1
     }

From fb5be0890181a1a932f35d64d27048064fce7c46 Mon Sep 17 00:00:00 2001
From: Miguel Varela Ramos <miguel@cortexlabs.com>
Date: Tue, 9 Feb 2021 15:30:34 +0100
Subject: [PATCH 18/26] Improve metrics.md docs

---
 docs/workloads/realtime/metrics.md | 39 ++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/docs/workloads/realtime/metrics.md b/docs/workloads/realtime/metrics.md
index 005ad62ba3..2ba1e43772 100644
--- a/docs/workloads/realtime/metrics.md
+++ b/docs/workloads/realtime/metrics.md
@@ -39,6 +39,14 @@ The `cortex get API_NAME` command also provides a link to a Grafana dashboard:
 
 The dashboard is displayed once you run a `cortex get <api_name>` command.
 
+Alternatively, you can access it on `http://<operator_url>/dashboard`. 
+Run the following command to get the operator URL:
+
+```shell
+cortex env list
+```
+
+
 ### Default credentials
 
 The dashboard is protected with username / password authentication, which by default are:
@@ -50,3 +58,34 @@ You will be prompted to change the admin user password in the first time you log
 
 Grafana allows managing the access of several users and managing teams. For more information on this topic check
 the [grafana documentation](https://grafana.com/docs/grafana/latest/manage-users/).
+
+### Selecting an API
+
+You can select one or more APIs to visualize in the top left corner of the dashboard.
+
+![](https://user-images.githubusercontent.com/7456627/107375721-57545180-6ae9-11eb-9474-ba58ad7eb0c5.png)
+
+### Selecting a time range
+
+Grafana allows you to select a time range on which the metrics will be visualized.
+You can do so in the top right corner of the dashboard.
+
+![](https://user-images.githubusercontent.com/7456627/107376148-d9dd1100-6ae9-11eb-8c2b-c678b41ade01.png)
+
+**Note: Cortex only retains a maximum of 2 weeks worth of data at any moment in time**
+
+### Available dashboards
+
+There are more than one dashboard available by default. 
+You can view the available dashboards by accessing the Grafana menu: 
+`Dashboards -> Manage -> Cortex folder`.
+
+## Exposed metrics
+
+Cortex exposes more metrics with Prometheus, that can be potentially useful. 
+To check the available metrics, access the `Explore` menu in grafana and press 
+the `Metrics` button.
+
+![](https://user-images.githubusercontent.com/7456627/107377492-515f7000-6aeb-11eb-9b46-909120335060.png)
+
+You can use any of these metrics to set up your own dashboards.

From 1e9387043058ec4e4ef5b1bd81d44b8699157545 Mon Sep 17 00:00:00 2001
From: Miguel Varela Ramos <miguel@cortexlabs.com>
Date: Tue, 9 Feb 2021 15:34:32 +0100
Subject: [PATCH 19/26] Fix linting errors

---
 docs/workloads/realtime/metrics.md | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/docs/workloads/realtime/metrics.md b/docs/workloads/realtime/metrics.md
index 2ba1e43772..b40a127182 100644
--- a/docs/workloads/realtime/metrics.md
+++ b/docs/workloads/realtime/metrics.md
@@ -39,14 +39,13 @@ The `cortex get API_NAME` command also provides a link to a Grafana dashboard:
 
 The dashboard is displayed once you run a `cortex get <api_name>` command.
 
-Alternatively, you can access it on `http://<operator_url>/dashboard`. 
-Run the following command to get the operator URL:
+Alternatively, you can access it on `http://<operator_url>/dashboard`. Run the following command to get the operator
+URL:
 
 ```shell
 cortex env list
 ```
 
-
 ### Default credentials
 
 The dashboard is protected with username / password authentication, which by default are:
@@ -67,8 +66,8 @@ You can select one or more APIs to visualize in the top left corner of the dashb
 
 ### Selecting a time range
 
-Grafana allows you to select a time range on which the metrics will be visualized.
-You can do so in the top right corner of the dashboard.
+Grafana allows you to select a time range on which the metrics will be visualized. You can do so in the top right corner
+of the dashboard.
 
 ![](https://user-images.githubusercontent.com/7456627/107376148-d9dd1100-6ae9-11eb-8c2b-c678b41ade01.png)
 
@@ -76,15 +75,13 @@ You can do so in the top right corner of the dashboard.
 
 ### Available dashboards
 
-There are more than one dashboard available by default. 
-You can view the available dashboards by accessing the Grafana menu: 
-`Dashboards -> Manage -> Cortex folder`.
+There is more than one dashboard available by default. You can view the available dashboards by accessing the Grafana
+menu: `Dashboards -> Manage -> Cortex folder`.
 
 ## Exposed metrics
 
-Cortex exposes more metrics with Prometheus, that can be potentially useful. 
-To check the available metrics, access the `Explore` menu in grafana and press 
-the `Metrics` button.
+Cortex exposes additional metrics through Prometheus that may be useful. To check the available metrics, access
+the `Explore` menu in Grafana and press the `Metrics` button.
 
 ![](https://user-images.githubusercontent.com/7456627/107377492-515f7000-6aeb-11eb-9b46-909120335060.png)
 

From dc4c507f750bedb98a5894933975a794215041c9 Mon Sep 17 00:00:00 2001
From: Miguel Varela Ramos <miguel@cortexlabs.com>
Date: Tue, 9 Feb 2021 16:03:46 +0100
Subject: [PATCH 20/26] Update versions.md

---
 dev/versions.md | 269 ++++++++++++++++++++++++++++++++----------------
 1 file changed, 182 insertions(+), 87 deletions(-)

diff --git a/dev/versions.md b/dev/versions.md
index 20f64304c3..ad817b3e98 100644
--- a/dev/versions.md
+++ b/dev/versions.md
@@ -8,10 +8,10 @@
 * check pod -> cluster autoscaling on cpu or gpu or inferentia
 * check cluster autoscaling on cpu and gpu and inferentia
 * examples
-  * check logs, predictions
-  * check metrics, tracker
-  * make sure to try all 8 base images (tf/onnx/py gpu/cpu, tf/py inferentia)
-  * confirm GPUs are used when requested
+    * check logs, predictions
+    * check metrics, tracker
+    * make sure to try all 8 base images (tf/onnx/py gpu/cpu, tf/py inferentia)
+    * confirm GPUs are used when requested
 
 ## eksctl
 
@@ -19,11 +19,13 @@
 1. Update the version in `manager/Dockerfile`
 1. Update `generate_eks.py` as necessary
 1. Check that `eksctl utils write-kubeconfig` log filter still behaves as desired
-1. Update eksctl on your dev machine: `curl --location "https://github.com/weaveworks/eksctl/releases/download/0.27.0/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp && sudo mv -f /tmp/eksctl /usr/local/bin`
+1. Update eksctl on your dev
+   machine: `curl --location "https://github.com/weaveworks/eksctl/releases/download/0.27.0/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp && sudo mv -f /tmp/eksctl /usr/local/bin`
 
 ## Kubernetes
 
-1. Find the latest version of Kubernetes supported by eksctl ([source code](https://github.com/weaveworks/eksctl/blob/master/pkg/apis/eksctl.io/v1alpha5/types.go))
+1. Find the latest version of Kubernetes supported by
+   eksctl ([source code](https://github.com/weaveworks/eksctl/blob/master/pkg/apis/eksctl.io/v1alpha5/types.go))
 1. Update the version in `generate_eks.py`
 1. See instructions for upgrading the Kubernetes client below
 
@@ -42,28 +44,30 @@
 
 ## Go
 
-1. Find the latest release on Golang's [release page](https://golang.org/doc/devel/release.html) (or [downloads page](https://golang.org/dl/)) and check the changelog
+1. Find the latest release on Golang's [release page](https://golang.org/doc/devel/release.html)
+   (or [downloads page](https://golang.org/dl/)) and check the changelog
 1. Search the codebase for the current minor version (e.g. `1.14`), update versions as appropriate
 1. Update your local version and alert developers:
-   * Linux:
-     1. `wget https://dl.google.com/go/go1.14.7.linux-amd64.tar.gz`
-     1. `tar -xvf go1.14.7.linux-amd64.tar.gz`
-     1. `sudo rm -rf /usr/local/go`
-     1. `sudo mv -f go /usr/local`
-     1. `rm go1.14.7.linux-amd64.tar.gz`
-     1. refresh shell
-     1. `go version`
-   * Mac:
-     1. `brew upgrade go` or `brew install go@1.14`
-     1. refresh shell
-     1. `go version`
+    * Linux:
+        1. `wget https://dl.google.com/go/go1.14.7.linux-amd64.tar.gz`
+        1. `tar -xvf go1.14.7.linux-amd64.tar.gz`
+        1. `sudo rm -rf /usr/local/go`
+        1. `sudo mv -f go /usr/local`
+        1. `rm go1.14.7.linux-amd64.tar.gz`
+        1. refresh shell
+        1. `go version`
+    * Mac:
+        1. `brew upgrade go` or `brew install go@1.14`
+        1. refresh shell
+        1. `go version`
 1. Update go modules as necessary
 
 ## Go modules
 
 ### Kubernetes client
 
-1. Find the latest patch release for the minor kubernetes version that EKS uses by default (here are [their versions](https://docs.aws.amazon.com/eks/latest/userguide/kubernetes-versions.html))
+1. Find the latest patch release for the minor kubernetes version that EKS uses by default (here
+   are [their versions](https://docs.aws.amazon.com/eks/latest/userguide/kubernetes-versions.html))
 1. Follow the "Update non-versioned modules" instructions using the updated version for `k8s.io/client-go`
 
 ### Istio client
@@ -76,11 +80,13 @@
 1. Find the latest tag from [releases](https://github.com/docker/engine/releases)
 1. Follow the "Update non-versioned modules" instructions using the updated version for `docker/engine`
 
-_note: docker client installation may be able to be improved, see https://github.com/moby/moby/issues/39302#issuecomment-639687466_
+_note: docker client installation may be able to be improved,
+see https://github.com/moby/moby/issues/39302#issuecomment-639687466_
 
 ### cortexlabs/yaml
 
-1. Check [go-yaml/yaml](https://github.com/go-yaml/yaml/commits/v2) to see if there were new releases since [cortexlabs/yaml](https://github.com/cortexlabs/yaml/commits/v2)
+1. Check [go-yaml/yaml](https://github.com/go-yaml/yaml/commits/v2) to see if there were new releases
+   since [cortexlabs/yaml](https://github.com/cortexlabs/yaml/commits/v2)
 1. `git clone git@github.com:cortexlabs/yaml.git && cd yaml`
 1. `git remote add upstream https://github.com/go-yaml/yaml && git fetch upstream`
 1. `git merge upstream/v2`
@@ -89,7 +95,8 @@ _note: docker client installation may be able to be improved, see https://github
 
 ### cortexlabs/go-input
 
-1. Check [tcnksm/go-input](https://github.com/tcnksm/go-input/commits/master) to see if there were new releases since [cortexlabs/go-input](https://github.com/cortexlabs/go-input/commits/master)
+1. Check [tcnksm/go-input](https://github.com/tcnksm/go-input/commits/master) to see if there were new releases
+   since [cortexlabs/go-input](https://github.com/cortexlabs/go-input/commits/master)
 1. `git clone git@github.com:cortexlabs/go-input.git && cd go-input`
 1. `git remote add upstream https://github.com/tcnksm/go-input && git fetch upstream`
 1. `git merge upstream/master`
@@ -124,7 +131,14 @@ _note: docker client installation may be able to be improved, see https://github
 
 The same Python version should be used throughout Cortex (e.g. search for `3.6` and update all accordingly).
 
-It's probably safest to use the minor version of Python that you get when you run `apt-get install python3` ([currently that's what TensorFlow's Docker image does](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/dockerfiles/dockerfiles/cpu.Dockerfile)), or what you get by default in Google CoLab. In theory, it should be safe to use the lowest of the maximum supported python versions in our pip dependencies (e.g. [tensorflow](https://pypi.org/project/tensorflow), [Keras](https://pypi.org/project/Keras), [numpy](https://pypi.org/project/numpy), [pandas](https://pypi.org/project/pandas), [scikit-learn](https://pypi.org/project/scikit-learn), [scipy](https://pypi.org/project/scipy), [torch](https://pypi.org/project/torch), [xgboost](https://pypi.org/project/xgboost))
+It's probably safest to use the minor version of Python that you get when you
+run `apt-get install python3` ([currently that's what TensorFlow's Docker image does](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/tools/dockerfiles/dockerfiles/cpu.Dockerfile)),
+or what you get by default in Google Colab. In theory, it should be safe to use the lowest of the maximum supported
+python versions in our pip dependencies (e.g. [tensorflow](https://pypi.org/project/tensorflow),
+[Keras](https://pypi.org/project/Keras), [numpy](https://pypi.org/project/numpy),
+[pandas](https://pypi.org/project/pandas), [scikit-learn](https://pypi.org/project/scikit-learn),
+[scipy](https://pypi.org/project/scipy), [torch](https://pypi.org/project/torch),
+[xgboost](https://pypi.org/project/xgboost)).
 
 ## TensorFlow / TensorFlow Serving
 
@@ -135,71 +149,111 @@ Note: it's ok if example training notebooks aren't upgraded, as long as the expo
 
 ## CUDA
 
-1. Update the `nvidia/cuda` base image in `images/python-predictor-gpu/Dockerfile` and `images/onnx-predictor-gpu/Dockerfile` (as well as `libnvinfer` in `images/python-predictor-gpu/Dockerfile` and `images/tensorflow-serving-gpu/Dockerfile`) to the desired version based on [TensorFlow's documentation](https://www.tensorflow.org/install/gpu) / [TensorFlow's compatability table](https://www.tensorflow.org/install/source#gpu) ([Dockerhub](https://hub.docker.com/r/nvidia/cuda)) (it's possible these versions will diverge depending on ONNX runtime support)
+1. Update the `nvidia/cuda` base image in `images/python-predictor-gpu/Dockerfile`
+   and `images/onnx-predictor-gpu/Dockerfile` (as well as `libnvinfer` in `images/python-predictor-gpu/Dockerfile`
+   and `images/tensorflow-serving-gpu/Dockerfile`) to the desired version based
+   on [TensorFlow's documentation](https://www.tensorflow.org/install/gpu)
+   / [TensorFlow's compatibility table](https://www.tensorflow.org/install/source#gpu) ([Dockerhub](https://hub.docker.com/r/nvidia/cuda))
+   (it's possible these versions will diverge depending on ONNX runtime support)
 
 ## ONNX runtime
 
-1. Update the version in `images/onnx-predictor-cpu/Dockerfile` and `images/onnx-predictor-gpu/Dockerfile` ([releases](https://github.com/microsoft/onnxruntime/releases))
+1. Update the version in `images/onnx-predictor-cpu/Dockerfile`
+   and `images/onnx-predictor-gpu/Dockerfile` ([releases](https://github.com/microsoft/onnxruntime/releases))
 1. Search the codebase for the previous ONNX runtime version
 
 ## Nvidia device plugin
 
-1. Update the version in `images/nvidia/Dockerfile` ([releases](https://github.com/NVIDIA/k8s-device-plugin/releases), [Dockerhub](https://hub.docker.com/r/nvidia/k8s-device-plugin))
-1. In the [GitHub Repo](https://github.com/NVIDIA/k8s-device-plugin), find the latest release and go to this file (replacing the version number): <https://github.com/NVIDIA/k8s-device-plugin/blob/v0.6.0/nvidia-device-plugin.yml>
+1. Update the version in `images/nvidia/Dockerfile` ([releases](https://github.com/NVIDIA/k8s-device-plugin/releases),
+   [Dockerhub](https://hub.docker.com/r/nvidia/k8s-device-plugin))
+1. In the [GitHub Repo](https://github.com/NVIDIA/k8s-device-plugin), find the latest release and go to this file
+   (replacing the version number): <https://github.com/NVIDIA/k8s-device-plugin/blob/v0.6.0/nvidia-device-plugin.yml>
 1. Copy the contents to `manager/manifests/nvidia.yaml`
-   1. Update the link at the top of the file to the URL you copied from
-   1. Check that your diff is reasonable (and put back any of our modifications, e.g. the image path, rolling update strategy, resource requests, tolerations, node selector, priority class, etc)
+    1. Update the link at the top of the file to the URL you copied from
+    1. Check that your diff is reasonable (and put back any of our modifications, e.g. the image path, rolling update
+       strategy, resource requests, tolerations, node selector, priority class, etc)
 1. Confirm GPUs work for PyTorch, TensorFlow, and ONNX models
 
 ## Inferentia device plugin
 
-1. Check if the image in [k8s-neuron-device-plugin.yml](https://github.com/aws/aws-neuron-sdk/blob/master/docs/neuron-container-tools/k8s-neuron-device-plugin.yml) has been updated (also check the readme in the parent directory to see if anything has changed). To check what the latest tag currently points to, run `aws ecr list-images --region us-west-2 --registry-id 790709498068 --repository-name neuron-device-plugin`, and then see which version has the same imageDigest as `latest`.
-1. Copy the contents of [k8s-neuron-device-plugin.yml](https://github.com/aws/aws-neuron-sdk/blob/master/docs/neuron-container-tools/k8s-neuron-device-plugin.yml) and [k8s-neuron-device-plugin-rbac.yml](https://github.com/aws/aws-neuron-sdk/blob/master/docs/neuron-container-tools/k8s-neuron-device-plugin-rbac.yml) to `manager/manifests/inferentia.yaml`
-   1. Update the links at the top of the file to the URL you copied from
-   1. Check that your diff is reasonable (and put back any of our modifications)
+1. Check if the image
+   in [k8s-neuron-device-plugin.yml](https://github.com/aws/aws-neuron-sdk/blob/master/docs/neuron-container-tools/k8s-neuron-device-plugin.yml)
+   has been updated (also check the readme in the parent directory to see if anything has changed). To check what the
+   latest tag currently points to,
+   run `aws ecr list-images --region us-west-2 --registry-id 790709498068 --repository-name neuron-device-plugin`, and
+   then see which version has the same imageDigest as `latest`.
+1. Copy the contents
+   of [k8s-neuron-device-plugin.yml](https://github.com/aws/aws-neuron-sdk/blob/master/docs/neuron-container-tools/k8s-neuron-device-plugin.yml)
+   and [k8s-neuron-device-plugin-rbac.yml](https://github.com/aws/aws-neuron-sdk/blob/master/docs/neuron-container-tools/k8s-neuron-device-plugin-rbac.yml)
+   to `manager/manifests/inferentia.yaml`
+    1. Update the links at the top of the file to the URL you copied from
+    1. Check that your diff is reasonable (and put back any of our modifications)
 
 ## Neuron
 
 1. `docker run --rm -it amazonlinux:2`
-1. Run the `echo $'[neuron] ...' > /etc/yum.repos.d/neuron.repo` command from [Dockerfile.neuron-rtd](https://github.com/aws/aws-neuron-sdk/blob/master/docs/neuron-container-tools/docker-example/Dockerfile.neuron-rtd) (it needs to be updated to work properly with the new lines)
-1. Run `yum info aws-neuron-tools` and `yum info aws-neuron-runtime` to check the versions that were installed, and use those versions in `images/neuron-rtd/Dockerfile`
-1. Check if there are any updates to [Dockerfile.neuron-rtd](https://github.com/aws/aws-neuron-sdk/blob/master/docs/neuron-container-tools/docker-example/Dockerfile.neuron-rtd) which should be brought in to `images/neuron-rtd/Dockerfile`
-1. Set the version of `aws-neuron-tools` and `aws-neuron-runtime` in `images/python-predictor-inf/Dockerfile` and `images/tensorflow-serving-inf/Dockerfile`
+1. Run the `echo $'[neuron] ...' > /etc/yum.repos.d/neuron.repo` command
+   from [Dockerfile.neuron-rtd](https://github.com/aws/aws-neuron-sdk/blob/master/docs/neuron-container-tools/docker-example/Dockerfile.neuron-rtd)
+   (it needs to be updated to work properly with the new lines)
+1. Run `yum info aws-neuron-tools` and `yum info aws-neuron-runtime` to check the versions that were installed, and use
+   those versions in `images/neuron-rtd/Dockerfile`
+1. Check if there are any updates
+   to [Dockerfile.neuron-rtd](https://github.com/aws/aws-neuron-sdk/blob/master/docs/neuron-container-tools/docker-example/Dockerfile.neuron-rtd)
+   which should be brought in to `images/neuron-rtd/Dockerfile`
+1. Set the version of `aws-neuron-tools` and `aws-neuron-runtime` in `images/python-predictor-inf/Dockerfile`
+   and `images/tensorflow-serving-inf/Dockerfile`
 1. Run `docker run --rm -it ubuntu:18.04`
-1. Run the first `RUN` command used in `images/tensorflow-serving-inf/Dockerfile`, having omitted the version specified for `tensorflow-model-server-neuron` and the cleanup line at the end
-1. Run `apt-cache policy tensorflow-model-server-neuron` to find the version that was installed, and update it in `images/tensorflow-serving-inf/Dockerfile`
-1. Check if there are any updates to [Dockerfile.tf-serving](https://github.com/aws/aws-neuron-sdk/blob/master/docs/neuron-container-tools/docker-example/Dockerfile.tf-serving) which should be brought in to `images/tensorflow-serving-inf/Dockerfile`
+1. Run the first `RUN` command used in `images/tensorflow-serving-inf/Dockerfile`, having omitted the version specified
+   for `tensorflow-model-server-neuron` and the cleanup line at the end
+1. Run `apt-cache policy tensorflow-model-server-neuron` to find the version that was installed, and update it
+   in `images/tensorflow-serving-inf/Dockerfile`
+1. Check if there are any updates
+   to [Dockerfile.tf-serving](https://github.com/aws/aws-neuron-sdk/blob/master/docs/neuron-container-tools/docker-example/Dockerfile.tf-serving)
+   which should be brought in to `images/tensorflow-serving-inf/Dockerfile`
 1. Run `docker run --rm -it ubuntu:18.04`
-1. Run `apt-get update && apt-get install -y curl python3.6 python3.6-distutils` (change the python version if necessary)
-1. Run `curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && python3.6 get-pip.py && pip install --upgrade pip` (change the python version if necessary)
+1. Run `apt-get update && apt-get install -y curl python3.6 python3.6-distutils` (change the python version if
+   necessary)
+1. Run `curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py && python3.6 get-pip.py && pip install --upgrade pip`
+   (change the python version if necessary)
 1. Run `pip install --extra-index-url https://pip.repos.neuron.amazonaws.com neuron-cc tensorflow-neuron torch-neuron`
-1. Run `pip list` to show the versions of all installed dependencies, and update `images/python-predictor-inf/Dockerfile` and the docs accordingly (`realtime-api/predictors.md` and `batch-api/predictors.md`); latest versions of dependencies that aren't shown in `pip list` can be determined on pypi.org (for `torchvision`, go to its pypi page, and use the latest patch version of the minor version which is appropriate for the version of `torch` that's installed)
+1. Run `pip list` to show the versions of all installed dependencies, and
+   update `images/python-predictor-inf/Dockerfile` and the docs accordingly (`realtime-api/predictors.md`
+   and `batch-api/predictors.md`); latest versions of dependencies that aren't shown in `pip list` can be determined on
+   pypi.org (for `torchvision`, go to its pypi page, and use the latest patch version of the minor version which is
+   appropriate for the version of `torch` that's installed)
 1. Take a deep breath, cross your fingers, rebuild all images, and confirm that the Inferentia examples work
 
 ## Python packages
 
-1. Update versions in `images/python-predictor-*/Dockerfile`, `images/tensorflow-predictor/Dockerfile`, and `images/onnx-predictor-*/Dockerfile`
+1. Update versions in `images/python-predictor-*/Dockerfile`, `images/tensorflow-predictor/Dockerfile`,
+   and `images/onnx-predictor-*/Dockerfile`
 1. Update versions in `pkg/cortex/serve/*requirements.txt` and `pkg/cortex/downloader/requirements.txt`
 1. Update the versions listed in "Pre-installed packages" in `realtime-api/predictors.md` and `batch-api/predictors.md`
     * look at the diff carefully since some packages are not shown, and e.g. `tensorflow-cpu` -> `tensorflow`
-    * be careful not to update any of the versions for Inferentia that are not latest in `images/python-predictor-inf/Dockerfile`
+    * be careful not to update any of the versions for Inferentia that are not latest
+      in `images/python-predictor-inf/Dockerfile`
 1. Rerun all examples and check their logs
 
 ## S6-overlay supervisor
 
-1. Locate the `s6-overlay` installation in `images/python-predictor-*/Dockerfile`, `images/tensorflow-predictor/Dockerfile` and `images/onnx-predictor-*/Dockerfile`
+1. Locate the `s6-overlay` installation in `images/python-predictor-*/Dockerfile`,
+   `images/tensorflow-predictor/Dockerfile` and `images/onnx-predictor-*/Dockerfile`
 1. Update the version in each serving image with the newer one in https://github.com/just-containers/s6-overlay.
 
 ## Nginx
 
-1. Run a base image of ubuntu that matches the version tag used for the serving images. The running command is `docker run -it --rm <base-image>`
+1. Run a base image of ubuntu that matches the version tag used for the serving images. The running command
+   is `docker run -it --rm <base-image>`
 1. Run `apt update && apt-cache policy nginx`. Notice the latest minor version of nginx (e.g. `1.14`)
-1. Locate the `nginx` package in `images/python-predictor-*/Dockerfile`, `images/tensorflow-predictor/Dockerfile` and `images/onnx-predictor-*/Dockerfile`
-1. Update the version for all `nginx` appearances using the minor version from step 2 and add an asterisk at the end to denote any version (e.g. `1.14.*`)
+1. Locate the `nginx` package in `images/python-predictor-*/Dockerfile`, `images/tensorflow-predictor/Dockerfile`
+   and `images/onnx-predictor-*/Dockerfile`
+1. Update the version for all `nginx` appearances using the minor version from step 2 and add an asterisk at the end to
+   denote any version (e.g. `1.14.*`)
 
 ## Istio
 
-1. Find the latest [release](https://istio.io/latest/news/releases) and check the release notes (here are the [latest IstioOperator Options](https://istio.io/latest/docs/reference/config/istio.operator.v1alpha1/))
+1. Find the latest [release](https://istio.io/latest/news/releases) and check the release notes (here are
+   the [latest IstioOperator Options](https://istio.io/latest/docs/reference/config/istio.operator.v1alpha1/))
 1. Update the version in `images/manager/Dockerfile`
 1. Update the version in all `images/istio-*` Dockerfiles
 1. Update `istio.yaml.j2`, `apis.yaml.j2`, `operator.yaml.j2`, and `pkg/lib/k8s` as necessary
@@ -207,56 +261,97 @@ Note: it's ok if example training notebooks aren't upgraded, as long as the expo
 
 ## Istio charts
 
-1. Download `curl -L https://istio.io/downloadIstio | ISTIO_VERSION=<ISTIO_VERSION_HERE> TARGET_ARCH=x86_64 sh -` and you will find manifests/charts containing helm charts.
-1. Copy the charts containing the istio crds, istio pilot and istio ingress gateway into manifests/charts/networking/charts. As of 1.7.3 these charts are in folders named: `base`, `istio-control/istio-discovery`, `gateways/istio-ingress`. Copy the istio-ingress folder twice except name one of them api-ingress and the other operator-ingress.
-1. Update manifests/charts/networking/values.yaml to override globals and default values.yaml in the istio charts as necessary
-1. Update template files in istio charts to propagate the necessary service annotations to ingress gateways based on config
-1. Test the helm charts for both aws and gcp provider `helm template testing manifests -n default --dry-run -f <values.yaml>` and verify that none of the resources are namespaced to any istio namespaces.
-
+1. Download `curl -L https://istio.io/downloadIstio | ISTIO_VERSION=<ISTIO_VERSION_HERE> TARGET_ARCH=x86_64 sh -` and
+   you will find manifests/charts containing helm charts.
+1. Copy the charts containing the istio crds, istio pilot and istio ingress gateway into
+   manifests/charts/networking/charts. As of 1.7.3 these charts are in folders named: `base`,
+   `istio-control/istio-discovery`, `gateways/istio-ingress`. Copy the istio-ingress folder twice except name one of
+   them api-ingress and the other operator-ingress.
+1. Update manifests/charts/networking/values.yaml to override globals and default values.yaml in the istio charts as
+   necessary
+1. Update template files in istio charts to propagate the necessary service annotations to ingress gateways based on
+   config
+1. Test the helm charts for both aws and gcp
+   provider `helm template testing manifests -n default --dry-run -f <values.yaml>` and verify that none of the
+   resources are namespaced to any istio namespaces.
 
 ## Google Pause
 
-1. Find the version of google pause used in the nvidia device driver yaml file referenced [here](https://cloud.google.com/kubernetes-engine/docs/how-to/gpus#installing_drivers)
+1. Find the version of google pause used in the nvidia device driver yaml file
+   referenced [here](https://cloud.google.com/kubernetes-engine/docs/how-to/gpus#installing_drivers)
 1. Update the version in `images/google-pause/Dockerfile`
 
 ## Metrics server
 
-1. Find the latest release on [GitHub](https://github.com/kubernetes-incubator/metrics-server/releases) and check the changelog
+1. Find the latest release on [GitHub](https://github.com/kubernetes-incubator/metrics-server/releases) and check the
+   changelog
 1. Update the version in `images/metrics-server/Dockerfile`
-1. In the [GitHub Repo](https://github.com/kubernetes-incubator/metrics-server), find the latest release and go to this directory (replacing the version number): <https://github.com/kubernetes-incubator/metrics-server/tree/v0.3.7/deploy/1.8+>
+1. In the [GitHub Repo](https://github.com/kubernetes-incubator/metrics-server), find the latest release and go to this
+   directory (replacing the version
+   number): <https://github.com/kubernetes-incubator/metrics-server/tree/v0.3.7/deploy/1.8+>
 1. Copy the contents of all of the files in that directory into `manager/manifests/metrics-server.yaml`
-   1. Update this line of config:
+    1. Update this line of config:
 
-       ```yaml
-       image: $CORTEX_IMAGE_METRICS_SERVER
-       ```
+        ```yaml
+        image: $CORTEX_IMAGE_METRICS_SERVER
+        ```
 
-   1. Update the link at the top of the file to the URL you copied from
-   1. Check that your diff is reasonable (there may have been other modifications to the file which should be preserved, like resource requests)
-1. You can confirm the metric server is running by showing the logs of the metrics-server pod, or via `kubectl get deployment metrics-server -n kube-system` and `kubectl get apiservice v1beta1.metrics.k8s.io -o yaml`
+    1. Update the link at the top of the file to the URL you copied from
+    1. Check that your diff is reasonable (there may have been other modifications to the file which should be
+       preserved, like resource requests)
+1. You can confirm the metrics server is running by showing the logs of the metrics-server pod, or
+   via `kubectl get deployment metrics-server -n kube-system`
+   and `kubectl get apiservice v1beta1.metrics.k8s.io -o yaml`
 
-Note: overriding horizontal-pod-autoscaler-sync-period on EKS is currently not supported (<https://github.com/awslabs/amazon-eks-ami/issues/176>)
+Note: overriding horizontal-pod-autoscaler-sync-period on EKS is currently not
+supported (<https://github.com/awslabs/amazon-eks-ami/issues/176>)
 
 ## Cluster autoscaler
 
-1. Find the latest patch release for our current version of k8s (e.g. k8s v1.17 -> cluster-autocluster v1.17.3) on [GitHub](https://github.com/kubernetes/autoscaler/releases) and check the changelog
+1. Find the latest patch release for our current version of k8s (e.g. k8s v1.17 -> cluster-autoscaler v1.17.3)
+   on [GitHub](https://github.com/kubernetes/autoscaler/releases) and check the changelog
 1. Update the base image in `images/cluster-autoscaler/Dockerfile` to the repository URL shown in the GitHub release
-1. In the [GitHub Repo](https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/aws), set the tree to the tag for the chosen release, and open `cloudprovider/aws/examples/cluster-autoscaler-autodiscover.yaml` (e.g. <https://github.com/kubernetes/autoscaler/blob/cluster-autoscaler-1.16.5/cluster-autoscaler/cloudprovider/aws/examples/cluster-autoscaler-autodiscover.yaml>)
+1. In the [GitHub Repo](https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/cloudprovider/aws), set
+   the tree to the tag for the chosen release, and
+   open `cloudprovider/aws/examples/cluster-autoscaler-autodiscover.yaml`
+   (e.g. <https://github.com/kubernetes/autoscaler/blob/cluster-autoscaler-1.16.5/cluster-autoscaler/cloudprovider/aws/examples/cluster-autoscaler-autodiscover.yaml>)
 1. Resolve merge conflicts with the template in `manager/manifests/cluster-autoscaler.yaml.j2`
 
 ## FluentBit
 
-1. Find the latest release on [Dockerhub](https://hub.docker.com/r/amazon/aws-for-fluent-bit/tags?page=1&ordering=last_updated)
+1. Find the latest release
+   on [Docker Hub](https://hub.docker.com/r/amazon/aws-for-fluent-bit/tags?page=1&ordering=last_updated)
 1. Update the base image version in `images/fluent-bit/Dockerfile`
 1. Update `fluent-bit.yaml` as necessary (make sure to maintain all Cortex environment variables)
 
-## Statsd
+## Prometheus Operator / Prometheus Config Reloader
+
+1. Find the latest release in the [GitHub Repo](https://github.com/prometheus-operator/prometheus-operator).
+1. Copy the `bundle.yaml` file contents into `prometheus-operator.yaml`.
+1. Replace the image in the Deployment resource with a cortex env var.
+1. Update the base image versions in `images/prometheus-operator/Dockerfile`
+   and `images/prometheus-config-reloader/Dockerfile`.
+
+## Prometheus
+
+1. Find the latest release on [Docker Hub](https://hub.docker.com/r/prom/prometheus/tags?page=1&ordering=last_updated),
+   compatible with the current version of Prometheus Operator.
+1. Update the base image version in `images/prometheus/Dockerfile`.
+1. Update `prometheus-monitoring.yaml` if the new release requires changes.
+
+## Prometheus StatsD Exporter
+
+1. Find the latest release
+   on [Docker Hub](https://registry.hub.docker.com/r/prom/statsd-exporter/tags?page=1&ordering=last_updated).
+1. Update the base image version in `images/prometheus-statsd-exporter/Dockerfile`.
+1. Update `prometheus-statsd-exporter.yaml` if the new release requires changes.
+
+## Grafana
 
-1. Find the latest release on [Dockerhub](https://hub.docker.com/r/amazon/cloudwatch-agent/tags)
-1. Update the version in `images/statsd/Dockerfile`
-1. In this [GitHub Repo](https://github.com/aws-samples/amazon-cloudwatch-container-insights), set the tree to `master` and open [k8s-yaml-templates/cwagent-statsd/cwagent-statsd-daemonset.yaml](https://github.com/aws-samples/amazon-cloudwatch-container-insights/blob/master/k8s-yaml-templates/cwagent-statsd/cwagent-statsd-daemonset.yaml) and [k8s-yaml-templates/cwagent-statsd/cwagent-statsd-configmap.yaml](https://github.com/aws-samples/amazon-cloudwatch-container-insights/blob/master/k8s-yaml-templates/cwagent-statsd/cwagent-statsd-configmap.yaml)
-1. Update `statsd.yaml` as necessary (this wasn't copy-pasted, so you may need to check the diff intelligently)
-1. Update the datadog client version in `pkg/cortex/serve/requirements.txt`
+1. Find the latest release
+   on [Docker Hub](https://registry.hub.docker.com/r/grafana/grafana/tags?page=1&ordering=last_updated).
+1. Update the base image version in `images/grafana/Dockerfile`.
+1. Update `grafana.yaml` if the new release requires changes.
 
 ## aws-iam-authenticator
 
@@ -268,16 +363,16 @@ Note: overriding horizontal-pod-autoscaler-sync-period on EKS is currently not s
 1. Find the latest release [here](https://storage.googleapis.com/kubernetes-release/release/stable.txt)
 1. Update the version in `images/manager/Dockerfile` and `images/operator/Dockerfile`
 1. Update your local version and alert developers
-   * Linux:
-     1. `curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl`
-     1. `chmod +x ./kubectl`
-     1. `sudo mv -f ./kubectl /usr/local/bin/kubectl`
-     1. refresh shell
-     1. `kubectl version`
-   * Mac:
-     1. `brew upgrade kubernetes-cli`
-     1. refresh shell
-     1. `kubectl version`
+    * Linux:
+        1. `curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl`
+        1. `chmod +x ./kubectl`
+        1. `sudo mv -f ./kubectl /usr/local/bin/kubectl`
+        1. refresh shell
+        1. `kubectl version`
+    * Mac:
+        1. `brew upgrade kubernetes-cli`
+        1. refresh shell
+        1. `kubectl version`
 
 ## Ubuntu base images
 

From 9dfaf9734b4842d68e2302cf096302525d813904 Mon Sep 17 00:00:00 2001
From: Miguel Varela Ramos <miguel@cortexlabs.com>
Date: Wed, 10 Feb 2021 17:24:25 +0100
Subject: [PATCH 21/26] Add metrics to summary.md

---
 docs/summary.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/summary.md b/docs/summary.md
index 1cfbf5537c..fc974145b7 100644
--- a/docs/summary.md
+++ b/docs/summary.md
@@ -22,6 +22,7 @@
   * [Server-side batching](workloads/realtime/server-side-batching.md)
   * [Autoscaling](workloads/realtime/autoscaling.md)
   * [Statuses](workloads/realtime/statuses.md)
+  * [Metrics](workloads/realtime/metrics.md)
   * Multi-model
     * [Example](workloads/realtime/multi-model/example.md)
     * [Configuration](workloads/realtime/multi-model/configuration.md)

From e386da37a8aba39b1dd5b8fe52d61b90e7a97bfa Mon Sep 17 00:00:00 2001
From: Miguel Varela Ramos <miguel@cortexlabs.com>
Date: Wed, 10 Feb 2021 17:26:46 +0100
Subject: [PATCH 22/26] Fix typo

---
 docs/workloads/realtime/metrics.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/workloads/realtime/metrics.md b/docs/workloads/realtime/metrics.md
index b40a127182..5848038352 100644
--- a/docs/workloads/realtime/metrics.md
+++ b/docs/workloads/realtime/metrics.md
@@ -37,7 +37,7 @@ The `cortex get API_NAME` command also provides a link to a Grafana dashboard:
 
 ## Accessing the dashboard
 
-The dashboard is displayed once you run a `cortex get <api_name>` command.
+The dashboard URL is displayed once you run a `cortex get <api_name>` command.
 
 Alternatively, you can access it on `http://<operator_url>/dashboard`. Run the following command to get the operator
 URL:

From 2a06bb1aeade2c53cf1917473e7b0d8ebffd1cba Mon Sep 17 00:00:00 2001
From: Miguel Varela Ramos <miguel@cortexlabs.com>
Date: Wed, 10 Feb 2021 17:53:58 +0100
Subject: [PATCH 23/26] Add pod affinity between grafana and prometheus

---
 charts/templates/grafana.yaml          | 9 +++++++++
 manager/manifests/grafana/grafana.yaml | 9 +++++++++
 2 files changed, 18 insertions(+)

diff --git a/charts/templates/grafana.yaml b/charts/templates/grafana.yaml
index 0b221d8fdb..eb65e707cf 100644
--- a/charts/templates/grafana.yaml
+++ b/charts/templates/grafana.yaml
@@ -178,6 +178,15 @@ spec:
         - name: grafana-dashboard-batch
           configMap:
             name: grafana-dashboard-batch
+      affinity:
+        podAffinity:
+          preferredDuringSchedulingIgnoredDuringExecution:
+            - podAffinityTerm:
+                labelSelector:
+                  matchLabels:
+                    prometheus: prometheus
+                topologyKey: kubernetes.io/hostname
+              weight: 100
 
 ---
 
diff --git a/manager/manifests/grafana/grafana.yaml b/manager/manifests/grafana/grafana.yaml
index 0fbc509bf5..b7f7c41056 100644
--- a/manager/manifests/grafana/grafana.yaml
+++ b/manager/manifests/grafana/grafana.yaml
@@ -155,6 +155,15 @@ spec:
         - name: grafana-dashboard-batch
           configMap:
             name: grafana-dashboard-batch
+      affinity:
+        podAffinity:
+          preferredDuringSchedulingIgnoredDuringExecution:
+            - podAffinityTerm:
+                labelSelector:
+                  matchLabels:
+                    prometheus: prometheus
+                topologyKey: kubernetes.io/hostname
+              weight: 100
 
 ---
 

From 723e366be9f12dc9b3617e1406d1c2faa14d792b Mon Sep 17 00:00:00 2001
From: Miguel Varela Ramos <miguel@cortexlabs.com>
Date: Wed, 10 Feb 2021 18:29:44 +0100
Subject: [PATCH 24/26] Rename grafana dashboards UIDs

---
 charts/dashboards/batch.json                              | 2 +-
 charts/dashboards/realtime.json                           | 2 +-
 manager/manifests/grafana/grafana-dashboard-batch.yaml    | 2 +-
 manager/manifests/grafana/grafana-dashboard-realtime.yaml | 2 +-
 pkg/operator/resources/realtimeapi/api.go                 | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/charts/dashboards/batch.json b/charts/dashboards/batch.json
index 946eed6908..a3ed008a91 100644
--- a/charts/dashboards/batch.json
+++ b/charts/dashboards/batch.json
@@ -381,6 +381,6 @@
   "timepicker": {},
   "timezone": "",
   "title": "BatchAPI",
-  "uid": "nEiYFWEMk",
+  "uid": "batchapi",
   "version": 1
 }
diff --git a/charts/dashboards/realtime.json b/charts/dashboards/realtime.json
index c5b077bd7e..616450dc7f 100644
--- a/charts/dashboards/realtime.json
+++ b/charts/dashboards/realtime.json
@@ -1194,6 +1194,6 @@
   "timepicker": {},
   "timezone": "",
   "title": "RealtimeAPI",
-  "uid": "xvWFsZPGk",
+  "uid": "realtimeapi",
   "version": 1
 }
diff --git a/manager/manifests/grafana/grafana-dashboard-batch.yaml b/manager/manifests/grafana/grafana-dashboard-batch.yaml
index 2649a406c0..2624506a9e 100644
--- a/manager/manifests/grafana/grafana-dashboard-batch.yaml
+++ b/manager/manifests/grafana/grafana-dashboard-batch.yaml
@@ -402,6 +402,6 @@ data:
       "timepicker": {},
       "timezone": "",
       "title": "BatchAPI",
-      "uid": "nEiYFWEMk",
+      "uid": "batchapi",
       "version": 1
     }
diff --git a/manager/manifests/grafana/grafana-dashboard-realtime.yaml b/manager/manifests/grafana/grafana-dashboard-realtime.yaml
index 52c7f98444..2765077516 100644
--- a/manager/manifests/grafana/grafana-dashboard-realtime.yaml
+++ b/manager/manifests/grafana/grafana-dashboard-realtime.yaml
@@ -1215,6 +1215,6 @@ data:
       "timepicker": {},
       "timezone": "",
       "title": "RealtimeAPI",
-      "uid": "xvWFsZPGk",
+      "uid": "realtimeapi",
       "version": 1
     }
diff --git a/pkg/operator/resources/realtimeapi/api.go b/pkg/operator/resources/realtimeapi/api.go
index 759ea5d78d..92bcf7193a 100644
--- a/pkg/operator/resources/realtimeapi/api.go
+++ b/pkg/operator/resources/realtimeapi/api.go
@@ -38,7 +38,7 @@ import (
 	kcore "k8s.io/api/core/v1"
 )
 
-const _realtimeDashboardUID = "xvWFsZPGk"
+const _realtimeDashboardUID = "realtimeapi"
 
 var _autoscalerCrons = make(map[string]cron.Cron) // apiName -> cron
 

From d8919ffb642f866b2c9fb93b554f189494933b11 Mon Sep 17 00:00:00 2001
From: Miguel Varela Ramos <miguel@cortexlabs.com>
Date: Wed, 10 Feb 2021 18:30:15 +0100
Subject: [PATCH 25/26] Add dashboard URL to batch apis on the CLI

---
 cli/cmd/lib_batch_apis.go                  |  4 ++++
 pkg/operator/resources/job/batchapi/api.go | 20 ++++++++++++++++++++
 2 files changed, 24 insertions(+)

diff --git a/cli/cmd/lib_batch_apis.go b/cli/cmd/lib_batch_apis.go
index a80cfbf5a0..04641cab28 100644
--- a/cli/cmd/lib_batch_apis.go
+++ b/cli/cmd/lib_batch_apis.go
@@ -131,6 +131,10 @@ func batchAPITable(batchAPI schema.APIResponse) string {
 		out += t.MustFormat()
 	}
 
+	if batchAPI.DashboardURL != nil && *batchAPI.DashboardURL != "" {
+		out += "\n" + console.Bold("metrics dashboard: ") + *batchAPI.DashboardURL + "\n"
+	}
+
 	out += "\n" + console.Bold("endpoint: ") + batchAPI.Endpoint + "\n"
 
 	out += "\n" + apiHistoryTable(batchAPI.APIVersions)
diff --git a/pkg/operator/resources/job/batchapi/api.go b/pkg/operator/resources/job/batchapi/api.go
index dd98ddf253..cf339d90f2 100644
--- a/pkg/operator/resources/job/batchapi/api.go
+++ b/pkg/operator/resources/job/batchapi/api.go
@@ -22,6 +22,7 @@ import (
 
 	"github.com/cortexlabs/cortex/pkg/lib/errors"
 	"github.com/cortexlabs/cortex/pkg/lib/parallel"
+	"github.com/cortexlabs/cortex/pkg/lib/pointer"
 	"github.com/cortexlabs/cortex/pkg/lib/sets/strset"
 	"github.com/cortexlabs/cortex/pkg/operator/config"
 	"github.com/cortexlabs/cortex/pkg/operator/lib/routines"
@@ -36,6 +37,8 @@ import (
 	kcore "k8s.io/api/core/v1"
 )
 
+const _batchDashboardUID = "batchapi"
+
 func UpdateAPI(apiConfig *userconfig.API, projectID string) (*spec.API, string, error) {
 	prevVirtualService, err := config.K8s.GetVirtualService(operator.K8sName(apiConfig.Name))
 	if err != nil {
@@ -285,11 +288,28 @@ func GetAPIByName(deployedResource *operator.DeployedResource) ([]schema.APIResp
 		}
 	}
 
+	dashboardURL := pointer.String(getDashboardURL(api.Name))
+
 	return []schema.APIResponse{
 		{
 			Spec:             *api,
 			BatchJobStatuses: jobStatuses,
 			Endpoint:         endpoint,
+			DashboardURL:     dashboardURL,
 		},
 	}, nil
 }
+
+func getDashboardURL(apiName string) string {
+	loadBalancerURL, err := operator.LoadBalancerURL()
+	if err != nil {
+		return ""
+	}
+
+	dashboardURL := fmt.Sprintf(
+		"%s/dashboard/d/%s/batchapi?orgId=1&refresh=30s&var-api_name=%s",
+		loadBalancerURL, _batchDashboardUID, apiName,
+	)
+
+	return dashboardURL
+}

From 652eacc9f0f0fbde9198b73b62bebea9b791a2b0 Mon Sep 17 00:00:00 2001
From: Miguel Varela Ramos <miguel@cortexlabs.com>
Date: Wed, 10 Feb 2021 19:44:47 +0100
Subject: [PATCH 26/26] Fix grafana installation in install.sh

---
 manager/install.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/manager/install.sh b/manager/install.sh
index ceb71cd8fd..6a9b07cb55 100755
--- a/manager/install.sh
+++ b/manager/install.sh
@@ -307,8 +307,8 @@ function setup_prometheus() {
 }
 
 function setup_grafana() {
-  kubectl manifests/grafana/grafana-dashboard-realtime.yaml >/dev/null
-  kubectl manifests/grafana/grafana-dashboard-batch.yaml >/dev/null
+  kubectl apply -f manifests/grafana/grafana-dashboard-realtime.yaml >/dev/null
+  kubectl apply -f manifests/grafana/grafana-dashboard-batch.yaml >/dev/null
   envsubst < manifests/grafana/grafana.yaml | kubectl apply -f - >/dev/null
 }