From 9a694243699ccba75390f7dff9e13aa0ffaff948 Mon Sep 17 00:00:00 2001 From: Omer Spillinger Date: Fri, 28 Jun 2019 18:38:50 -0700 Subject: [PATCH 1/6] Add manager --- Makefile | 6 +- cortex-installer.sh | 2042 ------------------------------ cortex.sh | 450 +++++++ docs/cluster/config.md | 20 +- docs/cluster/install.md | 38 +- docs/cluster/uninstall.md | 34 +- images/manager/Dockerfile | 23 + manager/aws.sh | 50 + manager/info.sh | 36 + manager/install.sh | 136 ++ manager/manifests/argo.yaml | 132 ++ manager/manifests/fluentd.yaml | 138 ++ manager/manifests/namespace.yaml | 18 + manager/manifests/nginx.yaml | 418 ++++++ manager/manifests/operator.yaml | 111 ++ manager/manifests/spark.yaml | 331 +++++ manager/uninstall.sh | 39 + manager/update.sh | 23 + pkg/operator/endpoints/errors.go | 2 +- pkg/operator/workloads/errors.go | 2 +- 20 files changed, 1939 insertions(+), 2110 deletions(-) delete mode 100755 cortex-installer.sh create mode 100755 cortex.sh create mode 100644 images/manager/Dockerfile create mode 100755 manager/aws.sh create mode 100755 manager/info.sh create mode 100755 manager/install.sh create mode 100644 manager/manifests/argo.yaml create mode 100644 manager/manifests/fluentd.yaml create mode 100644 manager/manifests/namespace.yaml create mode 100644 manager/manifests/nginx.yaml create mode 100644 manager/manifests/operator.yaml create mode 100644 manager/manifests/spark.yaml create mode 100755 manager/uninstall.sh create mode 100755 manager/update.sh diff --git a/Makefile b/Makefile index 7a2cbb019d..91e3ce5b8d 100644 --- a/Makefile +++ b/Makefile @@ -24,13 +24,13 @@ devstart: @./dev/operator_local.sh || true oinstall: - @./cortex-installer.sh -c=./dev/config/cortex.sh install operator + @./cortex.sh -c=./dev/config/cortex.sh install operator oupdate: - @./cortex-installer.sh -c=./dev/config/cortex.sh update operator + @./cortex.sh -c=./dev/config/cortex.sh update operator ouninstall: - @./cortex-installer.sh -c=./dev/config/cortex.sh uninstall 
operator + @./cortex.sh -c=./dev/config/cortex.sh uninstall operator ostop: @kubectl -n=cortex delete --ignore-not-found=true deployment operator diff --git a/cortex-installer.sh b/cortex-installer.sh deleted file mode 100755 index 630e2b0b91..0000000000 --- a/cortex-installer.sh +++ /dev/null @@ -1,2042 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Cortex Labs, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -e - -############ -### HELP ### -############ - -CORTEX_SH_TMP_DIR="$HOME/.cortex-sh-tmp" - -function show_help() { - echo " -Usage: - ./cortex-installer.sh command [sub-command] [flags] - -Available Commands: - install operator install the operator (and the AWS CLI if necessary) - install cli install the CLI - install kubernetes-tools install aws-iam-authenticator, eksctl, kubectl - - uninstall operator uninstall the operator - uninstall cli uninstall the CLI - uninstall kubernetes-tools uninstall aws-iam-authenticator, eksctl, kubectl - - update operator update the operator config and restart the operator - - get endpoints show the operator and API endpoints - -Flags: - -c, --config path to a cortex config file - -h, --help -" -} - -#################### -### FLAG PARSING ### -#################### - -flag_help=false -positional_args=() - -while [[ $# -gt 0 ]]; do - key="$1" - case $key in - -c|--config) - export CORTEX_CONFIG="$2" - shift - shift - ;; - -h|--help) - flag_help="true" - shift - ;; - *) - positional_args+=("$1") - shift - ;; - esac -done - 
-set -- "${positional_args[@]}" -positional_args=() -for i in "$@"; do - case $i in - -c=*|--config=*) - export CORTEX_CONFIG="${i#*=}" - shift - ;; - -h=*|--help=*) - flag_help="true" - ;; - *) - positional_args+=("$1") - shift - ;; - esac -done - -set -- "${positional_args[@]}" -if [ "$flag_help" == "true" ]; then - show_help - exit 0 -fi - -for arg in "$@"; do - if [[ "$arg" == -* ]]; then - echo "unknown flag: $arg" - show_help - exit 1 - fi -done - -##################### -### CONFIGURATION ### -##################### - -if [ "$CORTEX_CONFIG" != "" ] && [ -f "$CORTEX_CONFIG" ]; then - source $CORTEX_CONFIG -fi - -set -u - -export CORTEX_VERSION_STABLE=master - -# Defaults -random_id=$(cat /dev/urandom | LC_CTYPE=C tr -dc 'a-z0-9' | fold -w 12 | head -n 1) - -export CORTEX_LOG_GROUP="${CORTEX_LOG_GROUP:-cortex}" -export CORTEX_BUCKET="${CORTEX_BUCKET:-cortex-$random_id}" -export CORTEX_REGION="${CORTEX_REGION:-us-west-2}" -export CORTEX_NAMESPACE="${CORTEX_NAMESPACE:-cortex}" - -export CORTEX_IMAGE_ARGO_CONTROLLER="${CORTEX_IMAGE_ARGO_CONTROLLER:-cortexlabs/argo-controller:$CORTEX_VERSION_STABLE}" -export CORTEX_IMAGE_ARGO_EXECUTOR="${CORTEX_IMAGE_ARGO_EXECUTOR:-cortexlabs/argo-executor:$CORTEX_VERSION_STABLE}" -export CORTEX_IMAGE_FLUENTD="${CORTEX_IMAGE_FLUENTD:-cortexlabs/fluentd:$CORTEX_VERSION_STABLE}" -export CORTEX_IMAGE_NGINX_BACKEND="${CORTEX_IMAGE_NGINX_BACKEND:-cortexlabs/nginx-backend:$CORTEX_VERSION_STABLE}" -export CORTEX_IMAGE_NGINX_CONTROLLER="${CORTEX_IMAGE_NGINX_CONTROLLER:-cortexlabs/nginx-controller:$CORTEX_VERSION_STABLE}" -export CORTEX_IMAGE_OPERATOR="${CORTEX_IMAGE_OPERATOR:-cortexlabs/operator:$CORTEX_VERSION_STABLE}" -export CORTEX_IMAGE_SPARK="${CORTEX_IMAGE_SPARK:-cortexlabs/spark:$CORTEX_VERSION_STABLE}" -export CORTEX_IMAGE_SPARK_OPERATOR="${CORTEX_IMAGE_SPARK_OPERATOR:-cortexlabs/spark-operator:$CORTEX_VERSION_STABLE}" -export CORTEX_IMAGE_TF_SERVE="${CORTEX_IMAGE_TF_SERVE:-cortexlabs/tf-serve:$CORTEX_VERSION_STABLE}" -export 
CORTEX_IMAGE_TF_TRAIN="${CORTEX_IMAGE_TF_TRAIN:-cortexlabs/tf-train:$CORTEX_VERSION_STABLE}" -export CORTEX_IMAGE_TF_API="${CORTEX_IMAGE_TF_API:-cortexlabs/tf-api:$CORTEX_VERSION_STABLE}" -export CORTEX_IMAGE_PYTHON_PACKAGER="${CORTEX_IMAGE_PYTHON_PACKAGER:-cortexlabs/python-packager:$CORTEX_VERSION_STABLE}" -export CORTEX_IMAGE_TF_SERVE_GPU="${CORTEX_IMAGE_TF_SERVE_GPU:-cortexlabs/tf-serve-gpu:$CORTEX_VERSION_STABLE}" -export CORTEX_IMAGE_TF_TRAIN_GPU="${CORTEX_IMAGE_TF_TRAIN_GPU:-cortexlabs/tf-train-gpu:$CORTEX_VERSION_STABLE}" - -export AWS_ACCESS_KEY_ID="${AWS_ACCESS_KEY_ID:-""}" -export AWS_SECRET_ACCESS_KEY="${AWS_SECRET_ACCESS_KEY:-""}" -export CORTEX_ENABLE_TELEMETRY=${CORTEX_ENABLE_TELEMETRY:-""} - -################ -### CHECK OS ### -################ - -case "$OSTYPE" in - darwin*) PARSED_OS="darwin" ;; - linux*) PARSED_OS="linux" ;; - *) echo -e "\nerror: only mac and linux are supported"; exit 1 ;; -esac - -########################## -### TOP-LEVEL COMMANDS ### -########################## - -function install_operator() { - check_dep_curl - check_dep_aws - check_dep_kubectl - - setup_bucket - setup_cloudwatch_logs - - prompt_for_telemetry - - echo -e "\nInstalling the Cortex operator ..." 
- - setup_namespace - setup_configmap - setup_secrets - setup_spark - setup_argo - setup_nginx - setup_fluentd - setup_operator - - validate_cortex -} - -function install_cli() { - install_cortex_cli -} - -function install_kubernetes_tools() { - install_aws_iam_authenticator - install_eksctl - install_kubectl -} - -function uninstall_operator() { - check_dep_kubectl - - echo - if kubectl get namespace $CORTEX_NAMESPACE >/dev/null 2>&1 || kubectl get customresourcedefinition sparkapplications.sparkoperator.k8s.io >/dev/null 2>&1 || kubectl get customresourcedefinition scheduledsparkapplications.sparkoperator.k8s.io >/dev/null 2>&1 || kubectl get customresourcedefinition workflows.argoproj.io >/dev/null 2>&1; then - echo "Uninstalling the Cortex operator from your Kubernetes cluster ..." - - # Remove finalizers on sparkapplications (they sometimes create deadlocks) - if kubectl get namespace $CORTEX_NAMESPACE >/dev/null 2>&1 && kubectl get customresourcedefinition sparkapplications.sparkoperator.k8s.io >/dev/null 2>&1; then - set +e - kubectl -n=$CORTEX_NAMESPACE get sparkapplications.sparkoperator.k8s.io -o name | xargs -L1 \ - kubectl -n=$CORTEX_NAMESPACE patch -p '{"metadata":{"finalizers": []}}' --type=merge >/dev/null 2>&1 - set -e - fi - - kubectl delete --ignore-not-found=true customresourcedefinition scheduledsparkapplications.sparkoperator.k8s.io >/dev/null 2>&1 - kubectl delete --ignore-not-found=true customresourcedefinition sparkapplications.sparkoperator.k8s.io >/dev/null 2>&1 - kubectl delete --ignore-not-found=true customresourcedefinition workflows.argoproj.io >/dev/null 2>&1 - kubectl delete --ignore-not-found=true namespace $CORTEX_NAMESPACE >/dev/null 2>&1 - echo "✓ Uninstalled the Cortex operator" - else - echo "The Cortex operator is not installed on your Kubernetes cluster" - fi -} - -function uninstall_cli() { - uninstall_cortex_cli -} - -function uninstall_kubernetes_tools() { - uninstall_kubectl - uninstall_eksctl - 
uninstall_aws_iam_authenticator -} - -# Note: if namespace is changed, the old namespace will not be deleted -function update_operator() { - check_dep_curl - check_dep_aws - check_dep_kubectl - - kubectl -n=$CORTEX_NAMESPACE delete --ignore-not-found=true deployment operator >/dev/null 2>&1 - kubectl -n=$CORTEX_NAMESPACE delete --ignore-not-found=true daemonset fluentd >/dev/null 2>&1 # Pods in DaemonSets cannot be modified - install_operator -} - -function get_endpoints() { - check_dep_kubectl - - operator_endpoint=$(get_operator_endpoint) - apis_endpoint=$(get_apis_endpoint) - echo - echo "Operator endpoint: $operator_endpoint" - echo "APIs endpoint: $apis_endpoint" -} - -################# -### AWS SETUP ### -################# - -function setup_bucket() { - if ! aws s3api head-bucket --bucket $CORTEX_BUCKET --output json 2>/dev/null; then - if aws s3 ls "s3://$CORTEX_BUCKET" --output json 2>&1 | grep -q 'NoSuchBucket'; then - echo -e "\nCreating S3 bucket: $CORTEX_BUCKET" - aws s3api create-bucket --bucket $CORTEX_BUCKET \ - --region $CORTEX_REGION \ - --create-bucket-configuration LocationConstraint=$CORTEX_REGION \ - >/dev/null - else - echo -e "\nA bucket named \"${CORTEX_BUCKET}\" already exists, but you do not have access to it" - exit 1 - fi - else - echo -e "\nUsing existing S3 bucket: $CORTEX_BUCKET" - fi -} - -function setup_cloudwatch_logs() { - if ! 
aws logs list-tags-log-group --log-group-name $CORTEX_LOG_GROUP --region $CORTEX_REGION --output json 2>&1 | grep -q "\"tags\":"; then - echo -e "\nCreating CloudWatch log group: $CORTEX_LOG_GROUP" - aws logs create-log-group --log-group-name $CORTEX_LOG_GROUP --region $CORTEX_REGION - else - echo -e "\nUsing existing CloudWatch log group: $CORTEX_LOG_GROUP" - fi -} - -####################### -### NAMESPACE SETUP ### -####################### - -function setup_namespace() { - echo " -apiVersion: v1 -kind: Namespace -metadata: - name: ${CORTEX_NAMESPACE} -" | kubectl apply -f - >/dev/null -} - -####################### -### CONFIGMAP SETUP ### -####################### - -function setup_configmap() { - kubectl -n=$CORTEX_NAMESPACE create configmap 'cortex-config' \ - --from-literal='LOG_GROUP'=$CORTEX_LOG_GROUP \ - --from-literal='BUCKET'=$CORTEX_BUCKET \ - --from-literal='REGION'=$CORTEX_REGION \ - --from-literal='NAMESPACE'=$CORTEX_NAMESPACE \ - --from-literal='IMAGE_OPERATOR'=$CORTEX_IMAGE_OPERATOR \ - --from-literal='IMAGE_SPARK'=$CORTEX_IMAGE_SPARK \ - --from-literal='IMAGE_TF_TRAIN'=$CORTEX_IMAGE_TF_TRAIN \ - --from-literal='IMAGE_TF_SERVE'=$CORTEX_IMAGE_TF_SERVE \ - --from-literal='IMAGE_TF_API'=$CORTEX_IMAGE_TF_API \ - --from-literal='IMAGE_PYTHON_PACKAGER'=$CORTEX_IMAGE_PYTHON_PACKAGER \ - --from-literal='IMAGE_TF_TRAIN_GPU'=$CORTEX_IMAGE_TF_TRAIN_GPU \ - --from-literal='IMAGE_TF_SERVE_GPU'=$CORTEX_IMAGE_TF_SERVE_GPU \ - --from-literal='ENABLE_TELEMETRY'=$CORTEX_ENABLE_TELEMETRY \ - -o yaml --dry-run | kubectl apply -f - >/dev/null -} - -####################### -### SECRETS SETUP ### -####################### - -function setup_secrets() { - kubectl -n=$CORTEX_NAMESPACE create secret generic 'aws-credentials' \ - --from-literal='AWS_ACCESS_KEY_ID'=$AWS_ACCESS_KEY_ID \ - --from-literal='AWS_SECRET_ACCESS_KEY'=$AWS_SECRET_ACCESS_KEY \ - -o yaml --dry-run | kubectl apply -f - >/dev/null -} - -################## -### ARGO SETUP ### -################## - -function 
setup_argo() { - echo " -apiVersion: v1 -kind: ServiceAccount -metadata: - name: argo-executor - namespace: ${CORTEX_NAMESPACE} ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: argo-executor - namespace: ${CORTEX_NAMESPACE} -subjects: -- kind: ServiceAccount - name: argo-executor - namespace: ${CORTEX_NAMESPACE} -roleRef: - kind: ClusterRole - name: cluster-admin - apiGroup: rbac.authorization.k8s.io ---- -apiVersion: apiextensions.k8s.io/v1beta1 -kind: CustomResourceDefinition -metadata: - name: workflows.argoproj.io - namespace: ${CORTEX_NAMESPACE} -spec: - group: argoproj.io - names: - kind: Workflow - plural: workflows - shortNames: - - wf - scope: Namespaced - version: v1alpha1 ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: argo-controller - namespace: ${CORTEX_NAMESPACE} ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: argo-controller - namespace: ${CORTEX_NAMESPACE} -rules: -- apiGroups: [\"\"] - resources: [pods, pods/exec] - verbs: [create, get, list, watch, update, patch, delete] -- apiGroups: [\"\"] - resources: [configmaps] - verbs: [get, watch, list] -- apiGroups: [\"\"] - resources: [persistentvolumeclaims] - verbs: [create, delete] -- apiGroups: [argoproj.io] - resources: [workflows, workflows/finalizers] - verbs: [get, list, watch, update, patch, delete] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: argo - namespace: ${CORTEX_NAMESPACE} -subjects: -- kind: ServiceAccount - name: argo-controller - namespace: ${CORTEX_NAMESPACE} -roleRef: - kind: Role - name: argo-controller - apiGroup: rbac.authorization.k8s.io ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: argo-controller - namespace: ${CORTEX_NAMESPACE} -data: - config: | - namespace: ${CORTEX_NAMESPACE} ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: argo-controller - namespace: ${CORTEX_NAMESPACE} -spec: - selector: - matchLabels: - app: argo-controller - 
template: - metadata: - labels: - app: argo-controller - spec: - containers: - - args: - - --configmap - - argo-controller - - --executor-image - - ${CORTEX_IMAGE_ARGO_EXECUTOR} - - --executor-image-pull-policy - - Always - command: - - workflow-controller - image: ${CORTEX_IMAGE_ARGO_CONTROLLER} - imagePullPolicy: Always - name: argo-controller - serviceAccountName: argo-controller -" | kubectl apply -f - >/dev/null -} - -################### -### SPARK SETUP ### -################### - -function setup_spark() { - echo " -apiVersion: v1 -kind: ServiceAccount -metadata: - name: spark-operator - namespace: ${CORTEX_NAMESPACE} ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: spark-operator - namespace: ${CORTEX_NAMESPACE} -rules: -- apiGroups: [\"\"] - resources: [pods] - verbs: [\"*\"] -- apiGroups: [\"\"] - resources: [services, configmaps, secrets] - verbs: [create, get, delete] -- apiGroups: [extensions] - resources: [ingresses] - verbs: [create, get, delete] -- apiGroups: [\"\"] - resources: [nodes] - verbs: [get] -- apiGroups: [\"\"] - resources: [events] - verbs: [create, update, patch] -- apiGroups: [apiextensions.k8s.io] - resources: [customresourcedefinitions] - verbs: [create, get, update, delete] -- apiGroups: [admissionregistration.k8s.io] - resources: [mutatingwebhookconfigurations] - verbs: [create, get, update, delete] -- apiGroups: [sparkoperator.k8s.io] - resources: [sparkapplications, scheduledsparkapplications] - verbs: [\"*\"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: spark-operator - namespace: ${CORTEX_NAMESPACE} -subjects: - - kind: ServiceAccount - name: spark-operator - namespace: ${CORTEX_NAMESPACE} -roleRef: - kind: Role - name: spark-operator - apiGroup: rbac.authorization.k8s.io ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: spark-operator - namespace: ${CORTEX_NAMESPACE} - labels: - app.kubernetes.io/name: spark-operator - app.kubernetes.io/version: 
v2.4.0-v1alpha1 -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: spark-operator - app.kubernetes.io/version: v2.4.0-v1alpha1 - strategy: - type: Recreate - template: - metadata: - labels: - app.kubernetes.io/name: spark-operator - app.kubernetes.io/version: v2.4.0-v1alpha1 - initializers: - pending: [] - spec: - serviceAccountName: spark-operator - containers: - - name: spark-operator - image: ${CORTEX_IMAGE_SPARK_OPERATOR} - imagePullPolicy: Always - command: [\"/usr/bin/spark-operator\"] - args: - - -namespace=${CORTEX_NAMESPACE} - - -install-crds=false - - -logtostderr ---- -apiVersion: apiextensions.k8s.io/v1beta1 -kind: CustomResourceDefinition -metadata: - name: sparkapplications.sparkoperator.k8s.io -spec: - group: sparkoperator.k8s.io - names: - kind: SparkApplication - listKind: SparkApplicationList - plural: sparkapplications - shortNames: - - sparkapp - singular: sparkapplication - scope: Namespaced - validation: - openAPIV3Schema: - properties: - spec: - properties: - deps: - properties: - downloadTimeout: - minimum: 1 - type: integer - maxSimultaneousDownloads: - minimum: 1 - type: integer - driver: - properties: - cores: - exclusiveMinimum: true - minimum: 0 - type: number - podName: - pattern: '[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*' - executor: - properties: - cores: - exclusiveMinimum: true - minimum: 0 - type: number - instances: - minimum: 1 - type: integer - mode: - enum: - - cluster - - client - monitoring: - properties: - prometheus: - properties: - port: - maximum: 49151 - minimum: 1024 - type: integer - pythonVersion: - enum: - - \"2\" - - \"3\" - restartPolicy: - properties: - onFailureRetries: - minimum: 0 - type: integer - onFailureRetryInterval: - minimum: 1 - type: integer - onSubmissionFailureRetries: - minimum: 0 - type: integer - onSubmissionFailureRetryInterval: - minimum: 1 - type: integer - type: - enum: - - Never - - OnFailure - - Always - type: - enum: - - Java - - Scala - - 
Python - - R - version: v1alpha1 ---- -apiVersion: apiextensions.k8s.io/v1beta1 -kind: CustomResourceDefinition -metadata: - name: scheduledsparkapplications.sparkoperator.k8s.io -spec: - group: sparkoperator.k8s.io - names: - kind: ScheduledSparkApplication - listKind: ScheduledSparkApplicationList - plural: scheduledsparkapplications - shortNames: - - scheduledsparkapp - singular: scheduledsparkapplication - scope: Namespaced - validation: - openAPIV3Schema: - properties: - spec: - properties: - concurrencyPolicy: - enum: - - Allow - - Forbid - - Replace - failedRunHistoryLimit: - minimum: 1 - type: integer - schedule: - type: string - successfulRunHistoryLimit: - minimum: 1 - type: integer - template: - properties: - deps: - properties: - downloadTimeout: - minimum: 1 - type: integer - maxSimultaneousDownloads: - minimum: 1 - type: integer - driver: - properties: - cores: - exclusiveMinimum: true - minimum: 0 - type: number - podName: - pattern: '[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*' - executor: - properties: - cores: - exclusiveMinimum: true - minimum: 0 - type: number - instances: - minimum: 1 - type: integer - mode: - enum: - - cluster - - client - monitoring: - properties: - prometheus: - properties: - port: - maximum: 49151 - minimum: 1024 - type: integer - pythonVersion: - enum: - - \"2\" - - \"3\" - restartPolicy: - properties: - onFailureRetries: - minimum: 0 - type: integer - onFailureRetryInterval: - minimum: 1 - type: integer - onSubmissionFailureRetries: - minimum: 0 - type: integer - onSubmissionFailureRetryInterval: - minimum: 1 - type: integer - type: - enum: - - Never - - OnFailure - - Always - type: - enum: - - Java - - Scala - - Python - - R - version: v1alpha1 ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: spark - namespace: ${CORTEX_NAMESPACE} ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: spark - namespace: ${CORTEX_NAMESPACE} -rules: -- apiGroups: - - \"\" # \"\" 
indicates the core API group - resources: [pods] - verbs: [\"*\"] -- apiGroups: - - \"\" # \"\" indicates the core API group - resources: [services] - verbs: [\"*\"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: spark - namespace: ${CORTEX_NAMESPACE} -subjects: -- kind: ServiceAccount - name: spark - namespace: ${CORTEX_NAMESPACE} -roleRef: - kind: Role - name: spark - apiGroup: rbac.authorization.k8s.io -" | kubectl apply -f - >/dev/null -} - -################### -### NGINX SETUP ### -################### - -function setup_nginx() { - echo " ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: nginx - namespace: ${CORTEX_NAMESPACE} ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: nginx - namespace: ${CORTEX_NAMESPACE} -rules: - - apiGroups: [\"\"] - resources: [endpoints, pods, secrets] - verbs: [list, watch] - - apiGroups: [\"\"] - resources: [nodes, services, ingresses] - verbs: [get, list, watch] - - apiGroups: [\"\"] - resources: [events] - verbs: [create, patch] - - apiGroups: [\"extensions\"] - resources: [ingresses] - verbs: [get, list, watch] - - apiGroups: [\"extensions\"] - resources: [ingresses/status] - verbs: [update] - - apiGroups: [\"\"] - resources: [pods, secrets, namespaces, endpoints] - verbs: [get] - - apiGroups: [\"\"] - resources: [configmaps] - resourceNames: - # Defaults to \"-\" - # Here: \"-\" - # This has to be adapted if you change either parameter - # when launching the nginx-ingress-controller. 
- - \"ingress-controller-leader-operator\" - - \"ingress-controller-leader-apis\" - verbs: [get, update] - - apiGroups: [\"\"] - resources: [configmaps] - verbs: [get, list, watch, create] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: nginx - namespace: ${CORTEX_NAMESPACE} -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: Role - name: nginx -subjects: - - kind: ServiceAccount - name: nginx - namespace: ${CORTEX_NAMESPACE} ---- -kind: ConfigMap -apiVersion: v1 -metadata: - name: nginx-configuration - namespace: ${CORTEX_NAMESPACE} -data: - use-proxy-protocol: \"true\" ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: nginx-backend-operator - labels: - app.kubernetes.io/name: nginx-backend-operator - app.kubernetes.io/part-of: ingress-nginx - namespace: ${CORTEX_NAMESPACE} -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: nginx-backend-operator - app.kubernetes.io/part-of: ingress-nginx - template: - metadata: - labels: - app.kubernetes.io/name: nginx-backend-operator - app.kubernetes.io/part-of: ingress-nginx - spec: - terminationGracePeriodSeconds: 60 - containers: - - name: nginx-backend-operator - # Any image is permissible as long as: - # 1. It serves a 404 page at / - # 2. 
It serves 200 on a /healthz endpoint - image: ${CORTEX_IMAGE_NGINX_BACKEND} - imagePullPolicy: Always - livenessProbe: - httpGet: - path: /healthz - port: 8080 - scheme: HTTP - initialDelaySeconds: 30 - timeoutSeconds: 5 - ports: - - containerPort: 8080 - resources: - limits: - cpu: 10m - memory: 20Mi - requests: - cpu: 10m - memory: 20Mi ---- -apiVersion: v1 -kind: Service -metadata: - name: nginx-backend-operator - namespace: ${CORTEX_NAMESPACE} - labels: - app.kubernetes.io/name: nginx-backend-operator - app.kubernetes.io/part-of: ingress-nginx -spec: - ports: - - port: 80 - targetPort: 8080 - selector: - app.kubernetes.io/name: nginx-backend-operator - app.kubernetes.io/part-of: ingress-nginx ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: nginx-controller-operator - namespace: ${CORTEX_NAMESPACE} - labels: - app.kubernetes.io/name: nginx-controller-operator - app.kubernetes.io/part-of: ingress-nginx -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: nginx-controller-operator - app.kubernetes.io/part-of: ingress-nginx - template: - metadata: - labels: - app.kubernetes.io/name: nginx-controller-operator - app.kubernetes.io/part-of: ingress-nginx - spec: - serviceAccountName: nginx - containers: - - name: nginx-controller - image: ${CORTEX_IMAGE_NGINX_CONTROLLER} - imagePullPolicy: Always - args: - - /nginx-ingress-controller - - --watch-namespace=${CORTEX_NAMESPACE} - - --default-backend-service=${CORTEX_NAMESPACE}/nginx-backend-operator - - --configmap=${CORTEX_NAMESPACE}/nginx-configuration - - --publish-service=${CORTEX_NAMESPACE}/nginx-controller-operator - - --annotations-prefix=nginx.ingress.kubernetes.io - - --ingress-class=operator - securityContext: - capabilities: - drop: - - ALL - add: - - NET_BIND_SERVICE - # www-data -> 33 - runAsUser: 33 - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - ports: - - 
name: http - containerPort: 80 - - name: https - containerPort: 443 - livenessProbe: - failureThreshold: 3 - httpGet: - path: /healthz - port: 10254 - scheme: HTTP - initialDelaySeconds: 10 - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 1 - readinessProbe: - failureThreshold: 3 - httpGet: - path: /healthz - port: 10254 - scheme: HTTP - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 1 ---- -kind: Service -apiVersion: v1 -metadata: - name: nginx-controller-operator - namespace: ${CORTEX_NAMESPACE} - labels: - app.kubernetes.io/name: nginx-controller-operator - app.kubernetes.io/part-of: ingress-nginx - annotations: - # Enable PROXY protocol - service.beta.kubernetes.io/aws-load-balancer-proxy-protocol: '*' - # Ensure the ELB idle timeout is less than nginx keep-alive timeout. By default, - # NGINX keep-alive is set to 75s. If using WebSockets, the value will need to be - # increased to '3600' to avoid any potential issues. - service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: '60' -spec: - type: LoadBalancer - selector: - app.kubernetes.io/name: nginx-controller-operator - app.kubernetes.io/part-of: ingress-nginx - ports: - - name: http - port: 80 - targetPort: http - - name: https - port: 443 - targetPort: https ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: nginx-backend-apis - labels: - app.kubernetes.io/name: nginx-backend-apis - app.kubernetes.io/part-of: ingress-nginx - namespace: ${CORTEX_NAMESPACE} -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: nginx-backend-apis - app.kubernetes.io/part-of: ingress-nginx - template: - metadata: - labels: - app.kubernetes.io/name: nginx-backend-apis - app.kubernetes.io/part-of: ingress-nginx - spec: - terminationGracePeriodSeconds: 60 - containers: - - name: nginx-backend-apis - # Any image is permissible as long as: - # 1. It serves a 404 page at / - # 2. 
It serves 200 on a /healthz endpoint - image: ${CORTEX_IMAGE_NGINX_BACKEND} - imagePullPolicy: Always - livenessProbe: - httpGet: - path: /healthz - port: 8080 - scheme: HTTP - initialDelaySeconds: 30 - timeoutSeconds: 5 - ports: - - containerPort: 8080 - resources: - limits: - cpu: 10m - memory: 20Mi - requests: - cpu: 10m - memory: 20Mi ---- -apiVersion: v1 -kind: Service -metadata: - name: nginx-backend-apis - namespace: ${CORTEX_NAMESPACE} - labels: - app.kubernetes.io/name: nginx-backend-apis - app.kubernetes.io/part-of: ingress-nginx -spec: - ports: - - port: 80 - targetPort: 8080 - selector: - app.kubernetes.io/name: nginx-backend-apis - app.kubernetes.io/part-of: ingress-nginx ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: nginx-controller-apis - namespace: ${CORTEX_NAMESPACE} - labels: - app.kubernetes.io/name: nginx-backend-apis - app.kubernetes.io/part-of: ingress-nginx -spec: - replicas: 1 - selector: - matchLabels: - app.kubernetes.io/name: nginx-backend-apis - app.kubernetes.io/part-of: ingress-nginx - template: - metadata: - labels: - app.kubernetes.io/name: nginx-backend-apis - app.kubernetes.io/part-of: ingress-nginx - spec: - serviceAccountName: nginx - containers: - - name: nginx-controller - image: ${CORTEX_IMAGE_NGINX_CONTROLLER} - imagePullPolicy: Always - args: - - /nginx-ingress-controller - - --watch-namespace=${CORTEX_NAMESPACE} - - --default-backend-service=${CORTEX_NAMESPACE}/nginx-backend-apis - - --configmap=${CORTEX_NAMESPACE}/nginx-configuration - - --publish-service=${CORTEX_NAMESPACE}/nginx-backend-apis - - --annotations-prefix=nginx.ingress.kubernetes.io - - --ingress-class=apis - securityContext: - capabilities: - drop: - - ALL - add: - - NET_BIND_SERVICE - # www-data -> 33 - runAsUser: 33 - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - ports: - - name: http - containerPort: 80 - - name: https - 
containerPort: 443 - livenessProbe: - failureThreshold: 3 - httpGet: - path: /healthz - port: 10254 - scheme: HTTP - initialDelaySeconds: 10 - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 1 - readinessProbe: - failureThreshold: 3 - httpGet: - path: /healthz - port: 10254 - scheme: HTTP - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 1 ---- -kind: Service -apiVersion: v1 -metadata: - name: nginx-controller-apis - namespace: ${CORTEX_NAMESPACE} - labels: - app.kubernetes.io/name: nginx-backend-apis - app.kubernetes.io/part-of: ingress-nginx - annotations: - # Enable PROXY protocol - service.beta.kubernetes.io/aws-load-balancer-proxy-protocol: '*' - # Ensure the ELB idle timeout is less than nginx keep-alive timeout. By default, - # NGINX keep-alive is set to 75s. If using WebSockets, the value will need to be - # increased to '3600' to avoid any potential issues. - service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: '60' -spec: - type: LoadBalancer - selector: - app.kubernetes.io/name: nginx-backend-apis - app.kubernetes.io/part-of: ingress-nginx - ports: - - name: http - port: 80 - targetPort: http - - name: https - port: 443 - targetPort: https -" | kubectl apply -f - >/dev/null -} - -##################### -### FLUENTD SETUP ### -##################### - -function setup_fluentd() { - echo " -apiVersion: v1 -kind: ServiceAccount -metadata: - name: fluentd - namespace: ${CORTEX_NAMESPACE} - labels: - app: fluentd ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: fluentd - namespace: ${CORTEX_NAMESPACE} -rules: -- apiGroups: [\"\"] - resources: [pods] - verbs: [get, list, watch] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: fluentd - namespace: ${CORTEX_NAMESPACE} -subjects: -- kind: ServiceAccount - name: fluentd - namespace: ${CORTEX_NAMESPACE} -roleRef: - kind: Role - name: fluentd - apiGroup: rbac.authorization.k8s.io ---- -apiVersion: v1 -kind: ConfigMap 
-metadata: - name: fluentd - namespace: ${CORTEX_NAMESPACE} -data: - fluent.conf: | - - @type null - - - @type tail - enable_stat_watcher false - path /var/log/containers/**_${CORTEX_NAMESPACE}_**.log - pos_file /var/log/fluentd-containers.log.pos - time_format %Y-%m-%dT%H:%M:%S.%NZ - tag * - format json - read_from_head true - - - @type cloudwatch_logs - log_group_name \"#{ENV['LOG_GROUP_NAME']}\" - auto_create_stream true - use_tag_as_stream true - ---- -apiVersion: extensions/v1beta1 -kind: DaemonSet -metadata: - name: fluentd - namespace: ${CORTEX_NAMESPACE} -spec: - template: - metadata: - labels: - app: fluentd - spec: - serviceAccountName: fluentd - initContainers: - - name: copy-fluentd-config - image: busybox - command: ['sh', '-c', 'cp /config-volume/* /etc/fluentd'] - volumeMounts: - - name: config-volume - mountPath: /config-volume - - name: config - mountPath: /etc/fluentd - containers: - - name: fluentd - image: ${CORTEX_IMAGE_FLUENTD} - imagePullPolicy: Always - env: - - name: AWS_REGION - value: ${CORTEX_REGION} - - name: LOG_GROUP_NAME - value: ${CORTEX_LOG_GROUP} - - name: AWS_ACCESS_KEY_ID - valueFrom: - secretKeyRef: - name: aws-credentials - key: AWS_ACCESS_KEY_ID - - name: AWS_SECRET_ACCESS_KEY - valueFrom: - secretKeyRef: - name: aws-credentials - key: AWS_SECRET_ACCESS_KEY - volumeMounts: - - name: varlog - mountPath: /var/log - - name: varlibdockercontainers - mountPath: /var/lib/docker/containers - readOnly: true - - name: config - mountPath: /fluentd/etc - terminationGracePeriodSeconds: 30 - volumes: - - name: varlog - hostPath: - path: /var/log - - name: varlibdockercontainers - hostPath: - path: /var/lib/docker/containers - - name: config - emptyDir: {} - - name: config-volume - configMap: - name: fluentd -" | kubectl apply -f - >/dev/null -} - -###################### -### OPERATOR SETUP ### -###################### - -function setup_operator() { - echo " -apiVersion: v1 -kind: ServiceAccount -metadata: - name: operator - namespace: 
${CORTEX_NAMESPACE} ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: operator - namespace: ${CORTEX_NAMESPACE} -subjects: -- kind: ServiceAccount - name: operator - namespace: ${CORTEX_NAMESPACE} -roleRef: - kind: ClusterRole - name: cluster-admin - apiGroup: rbac.authorization.k8s.io ---- -apiVersion: apps/v1 -kind: Deployment -metadata: - name: operator - namespace: ${CORTEX_NAMESPACE} - labels: - workloadType: operator -spec: - replicas: 1 - selector: - matchLabels: - workloadId: operator - template: - metadata: - labels: - workloadId: operator - workloadType: operator - spec: - containers: - - name: operator - image: ${CORTEX_IMAGE_OPERATOR} - imagePullPolicy: Always - env: - - name: AWS_ACCESS_KEY_ID - valueFrom: - secretKeyRef: - name: aws-credentials - key: AWS_ACCESS_KEY_ID - - name: AWS_SECRET_ACCESS_KEY - valueFrom: - secretKeyRef: - name: aws-credentials - key: AWS_SECRET_ACCESS_KEY - volumeMounts: - - name: cortex-config - mountPath: /configs/cortex - volumes: - - name: cortex-config - configMap: - name: cortex-config - serviceAccountName: operator ---- -kind: Service -apiVersion: v1 -metadata: - name: operator - namespace: ${CORTEX_NAMESPACE} - labels: - workloadType: operator -spec: - selector: - workloadId: operator - ports: - - port: 8888 - targetPort: 8888 ---- -apiVersion: extensions/v1beta1 -kind: Ingress -metadata: - name: operator - namespace: ${CORTEX_NAMESPACE} - labels: - workloadType: operator - annotations: - kubernetes.io/ingress.class: operator -spec: - rules: - - http: - paths: - - path: / - backend: - serviceName: operator - servicePort: 8888 -" | kubectl apply -f - >/dev/null -} - -function validate_cortex() { - set +e - - echo -en "\nWaiting for the Cortex operator to be ready " - - operator_load_balancer="waiting" - api_load_balancer="waiting" - operator_endpoint_reachable="waiting" - operator_pod_ready_cycles=0 - operator_endpoint="" - - while true; do - echo -n "." 
- sleep 5 - - operator_pod_name=$(kubectl -n=$CORTEX_NAMESPACE get pods -o=name --sort-by=.metadata.creationTimestamp | grep "^pod/operator-" | tail -1) - if [ "$operator_pod_name" == "" ]; then - operator_pod_ready_cycles=0 - else - is_ready=$(kubectl -n=$CORTEX_NAMESPACE get "$operator_pod_name" -o jsonpath='{.status.containerStatuses[0].ready}') - if [ "$is_ready" == "true" ]; then - ((operator_pod_ready_cycles++)) - else - operator_pod_ready_cycles=0 - fi - fi - - if [ "$operator_load_balancer" != "ready" ]; then - out=$(kubectl -n=$CORTEX_NAMESPACE get service nginx-controller-operator -o json | tr -d '[:space:]') - if [[ $out != *'"loadBalancer":{"ingress":[{"'* ]]; then - continue - fi - operator_load_balancer="ready" - fi - - if [ "$api_load_balancer" != "ready" ]; then - out=$(kubectl -n=$CORTEX_NAMESPACE get service nginx-controller-apis -o json | tr -d '[:space:]') - if [[ $out != *'"loadBalancer":{"ingress":[{"'* ]]; then - continue - fi - api_load_balancer="ready" - fi - - if [ "$operator_endpoint" = "" ]; then - operator_endpoint=$(kubectl -n=$CORTEX_NAMESPACE get service nginx-controller-operator -o json | tr -d '[:space:]' | sed 's/.*{\"hostname\":\"\(.*\)\".*/\1/') - fi - - if [ "$operator_endpoint_reachable" != "ready" ]; then - if ! curl $operator_endpoint >/dev/null 2>&1; then - continue - fi - operator_endpoint_reachable="ready" - fi - - if [ "$operator_pod_ready_cycles" == "0" ] && [ "$operator_pod_name" != "" ]; then - num_restart=$(kubectl -n=$CORTEX_NAMESPACE get "$operator_pod_name" -o jsonpath='{.status.containerStatuses[0].restartCount}') - if [[ $num_restart -ge 2 ]]; then - echo -e "\n\nAn error occurred when starting the Cortex operator. View the logs with:" - echo " kubectl logs $operator_pod_name --namespace=$CORTEX_NAMESPACE" - exit 1 - fi - continue - fi - - if [[ $operator_pod_ready_cycles -lt 3 ]]; then - continue - fi - - echo " ✓" - break - done - - echo -e "\nCortex is ready!" 
- - get_endpoints - - if command -v cortex >/dev/null; then - echo -e "\nPlease run \`cortex configure\` to make sure your CLI is configured correctly" - fi -} - -function get_operator_endpoint() { - set -eo pipefail - kubectl -n=$CORTEX_NAMESPACE get service nginx-controller-operator -o json | tr -d '[:space:]' | sed 's/.*{\"hostname\":\"\(.*\)\".*/\1/' -} - -function get_apis_endpoint() { - set -eo pipefail - kubectl -n=$CORTEX_NAMESPACE get service nginx-controller-apis -o json | tr -d '[:space:]' | sed 's/.*{\"hostname\":\"\(.*\)\".*/\1/' -} - -############################# -### DEPENDENCY MANAGEMENT ### -############################# - -function check_dep_curl() { - if ! command -v curl >/dev/null; then - echo -e "\nerror: please install \`curl\`" - exit 1 - fi -} - -function check_dep_unzip() { - if ! command -v unzip >/dev/null; then - echo -e "\nerror: please install \`unzip\`" - exit 1 - fi -} - -function check_dep_kubectl() { - set -e - - if ! command -v kubectl >/dev/null 2>&1; then - echo - read -p "kubectl is required. Would you like cortex-installer.sh to install it? [Y/n] " -n 1 -r - echo - if [[ $REPLY =~ ^[Yy]$ ]]; then - install_kubectl - else - exit 1 - fi - fi - - if ! kubectl config current-context >/dev/null 2>&1; then - echo -e "\nerror: kubectl is not configured to connect with your cluster. If you are using eksctl, you can run this command to configure kubectl:" - echo " eksctl utils write-kubeconfig --name=cortex" - exit 1 - fi - - set +e - get_nodes_output=$(kubectl get nodes -o jsonpath='{range .items[*]}{@.metadata.name}:{range @.status.conditions[*]}{@.type}={@.status};{end}{end}' 2>/dev/null) - if [ $? 
-ne 0 ]; then - echo -e "\nerror: either your AWS credentials are incorrect or kubectl is not properly configured to connect with your cluster" - echo "If you are using eksctl, you can run this command to re-configure kubectl:" - echo " eksctl utils write-kubeconfig --name=cortex" - echo "If you are changing IAM users, you must edit the aws-auth ConfigMap (using your previous IAM credentials) to add the new IAM user; see https://docs.aws.amazon.com/eks/latest/userguide/add-user-role.html" - exit 1 - fi - set -e - num_nodes_ready=$(echo $get_nodes_output | tr ';' "\n" | grep "Ready=True" | wc -l) - if ! [[ $num_nodes_ready -ge 1 ]]; then - echo -e "\nerror: your cluster has no registered nodes" - exit 1 - fi -} - -function install_kubectl() { - set -e - - if command -v kubectl >/dev/null; then - echo -e "\nkubectl is already installed" - return - fi - - check_dep_curl - - echo -e "\nInstalling kubectl (/usr/local/bin/kubectl) ..." - - rm -rf $CORTEX_SH_TMP_DIR && mkdir -p $CORTEX_SH_TMP_DIR - curl -s -Lo $CORTEX_SH_TMP_DIR/kubectl https://storage.googleapis.com/kubernetes-release/release/v1.13.3/bin/$PARSED_OS/amd64/kubectl - chmod +x $CORTEX_SH_TMP_DIR/kubectl - - if [ $(id -u) = 0 ]; then - mv $CORTEX_SH_TMP_DIR/kubectl /usr/local/bin/kubectl - else - ask_sudo - sudo mv $CORTEX_SH_TMP_DIR/kubectl /usr/local/bin/kubectl - fi - - rm -rf $CORTEX_SH_TMP_DIR - echo "✓ Installed kubectl" -} - -function uninstall_kubectl() { - set -e - - if ! command -v kubectl >/dev/null; then - echo -e "\nkubectl is not installed" - return - fi - - if [[ ! -f /usr/local/bin/kubectl ]]; then - echo -e "\nkubectl was not found at /usr/local/bin/kubectl, please uninstall it manually" - return - fi - - echo - read -p "Would you like to uninstall kubectl (/usr/local/bin/kubectl)? 
[Y/n] " -n 1 -r - echo - if [[ $REPLY =~ ^[Yy]$ ]]; then - if [ $(id -u) = 0 ]; then - rm /usr/local/bin/kubectl - else - ask_sudo - sudo rm /usr/local/bin/kubectl - fi - rm -rf $HOME/.kube - echo "✓ Uninstalled kubectl" - else - return - fi -} - -function check_dep_aws() { - set -e - - if ! command -v aws >/dev/null 2>&1; then - echo - read -p "The AWS CLI is required. Would you like cortex-installer.sh to install it? [Y/n] " -n 1 -r - echo - if [[ $REPLY =~ ^[Yy]$ ]]; then - install_aws - else - exit 1 - fi - fi - - if [ -z "$AWS_ACCESS_KEY_ID" ]; then - echo -e "\nerror: please export AWS_ACCESS_KEY_ID" - exit 1 - fi - - if [ -z "$AWS_SECRET_ACCESS_KEY" ]; then - echo -e "\nerror: please export AWS_SECRET_ACCESS_KEY" - exit 1 - fi -} - -function install_aws() { - set -e - - if command -v aws >/dev/null; then - echo "The AWS CLI is already installed" - return - fi - - check_dep_curl - check_dep_unzip - - if command -v python >/dev/null; then - py_path=$(which python) - elif command -v python3 >/dev/null; then - py_path=$(which python3) - else - echo -e "\nerror: please install python or python3 using your package manager" - exit 1 - fi - - if ! $py_path -c "import distutils.sysconfig" >/dev/null 2>&1; then - if command -v python3 >/dev/null; then - echo -e "\nerror: please install python3-distutils using your package manager" - else - echo -e "\nerror: please install python distutils" - fi - exit 1 - fi - - echo -e "\nInstalling the AWS CLI (/usr/local/bin/aws) ..." 
- - rm -rf $CORTEX_SH_TMP_DIR && mkdir -p $CORTEX_SH_TMP_DIR - curl -s -o $CORTEX_SH_TMP_DIR/awscli-bundle.zip https://s3.amazonaws.com/aws-cli/awscli-bundle.zip - unzip $CORTEX_SH_TMP_DIR/awscli-bundle.zip -d $CORTEX_SH_TMP_DIR >/dev/null - - if [ $(id -u) = 0 ]; then - $py_path $CORTEX_SH_TMP_DIR/awscli-bundle/install -i /usr/local/aws -b /usr/local/bin/aws >/dev/null - else - ask_sudo - sudo $py_path $CORTEX_SH_TMP_DIR/awscli-bundle/install -i /usr/local/aws -b /usr/local/bin/aws >/dev/null - fi - - rm -rf $CORTEX_SH_TMP_DIR - echo "✓ Installed the AWS CLI" -} - -function uninstall_aws() { - set -e - - if ! command -v aws >/dev/null; then - echo -e "\nThe AWS CLI is not installed" - return - fi - - if [[ ! -f /usr/local/bin/aws ]]; then - echo -e "\nThe AWS CLI was not found at /usr/local/bin/aws, please uninstall it manually" - return - fi - - echo - read -p "Would you like to uninstall the AWS CLI (/usr/local/bin/aws)? [Y/n] " -n 1 -r - echo - if [[ $REPLY =~ ^[Yy]$ ]]; then - if [ $(id -u) = 0 ]; then - rm -rf /usr/local/aws - rm /usr/local/bin/aws - else - ask_sudo - sudo rm -rf /usr/local/aws - sudo rm /usr/local/bin/aws - fi - rm -rf $HOME/.aws - echo "✓ Uninstalled the AWS CLI" - else - return - fi -} - -function install_eksctl() { - set -e - - if command -v eksctl >/dev/null; then - echo -e "\neksctl is already installed" - return - fi - - check_dep_curl - - echo -e "\nInstalling eksctl (/usr/local/bin/eksctl) ..." - - rm -rf $CORTEX_SH_TMP_DIR && mkdir -p $CORTEX_SH_TMP_DIR - (cd $CORTEX_SH_TMP_DIR && curl -s --location "https://github.com/weaveworks/eksctl/releases/download/0.1.21/eksctl_$(uname -s)_amd64.tar.gz" | tar xz) - chmod +x $CORTEX_SH_TMP_DIR/eksctl - - if [ $(id -u) = 0 ]; then - mv $CORTEX_SH_TMP_DIR/eksctl /usr/local/bin/eksctl - else - ask_sudo - sudo mv $CORTEX_SH_TMP_DIR/eksctl /usr/local/bin/eksctl - fi - - rm -rf $CORTEX_SH_TMP_DIR - echo "✓ Installed eksctl" -} - -function uninstall_eksctl() { - set -e - - if ! 
command -v eksctl >/dev/null; then - echo -e "\neksctl is not installed" - return - fi - - if [[ ! -f /usr/local/bin/eksctl ]]; then - echo -e "\neksctl was not found at /usr/local/bin/eksctl, please uninstall it manually" - return - fi - - echo - read -p "Would you like to uninstall eksctl (/usr/local/bin/eksctl)? [Y/n] " -n 1 -r - echo - if [[ $REPLY =~ ^[Yy]$ ]]; then - if [ $(id -u) = 0 ]; then - rm /usr/local/bin/eksctl - else - ask_sudo - sudo rm /usr/local/bin/eksctl - fi - echo "✓ Uninstalled eksctl" - else - return - fi -} - -function install_aws_iam_authenticator() { - set -e - - if command -v aws-iam-authenticator >/dev/null; then - echo -e "\naws-iam-authenticator is already installed" - return - fi - - check_dep_curl - - echo -e "\nInstalling aws-iam-authenticator (/usr/local/bin/aws-iam-authenticator) ..." - - rm -rf $CORTEX_SH_TMP_DIR && mkdir -p $CORTEX_SH_TMP_DIR - curl -s -o $CORTEX_SH_TMP_DIR/aws-iam-authenticator https://amazon-eks.s3-us-west-2.amazonaws.com/1.11.5/2018-12-06/bin/$PARSED_OS/amd64/aws-iam-authenticator - chmod +x $CORTEX_SH_TMP_DIR/aws-iam-authenticator - - if [ $(id -u) = 0 ]; then - mv $CORTEX_SH_TMP_DIR/aws-iam-authenticator /usr/local/bin/aws-iam-authenticator - else - ask_sudo - sudo mv $CORTEX_SH_TMP_DIR/aws-iam-authenticator /usr/local/bin/aws-iam-authenticator - fi - - rm -rf $CORTEX_SH_TMP_DIR - echo "✓ Installed aws-iam-authenticator" -} - -function uninstall_aws_iam_authenticator() { - set -e - - if ! command -v aws-iam-authenticator >/dev/null; then - echo -e "\naws-iam-authenticator is not installed" - return - fi - - if [[ ! -f /usr/local/bin/aws-iam-authenticator ]]; then - echo -e "\naws-iam-authenticator was not found at /usr/local/bin/aws-iam-authenticator, please uninstall it manually" - return - fi - - echo - read -p "Would you like to uninstall aws-iam-authenticator (/usr/local/bin/aws-iam-authenticator)? 
[Y/n] " -n 1 -r - echo - if [[ $REPLY =~ ^[Yy]$ ]]; then - if [ $(id -u) = 0 ]; then - rm /usr/local/bin/aws-iam-authenticator - else - ask_sudo - sudo rm /usr/local/bin/aws-iam-authenticator - fi - echo "✓ Uninstalled aws-iam-authenticator" - else - return - fi -} - -function install_cortex_cli() { - set -e - - if command -v cortex >/dev/null; then - echo "The Cortex CLI is already installed" - return - fi - - check_dep_curl - - echo -e "\nInstalling the Cortex CLI (/usr/local/bin/cortex) ..." - - rm -rf $CORTEX_SH_TMP_DIR && mkdir -p $CORTEX_SH_TMP_DIR - curl -s -o $CORTEX_SH_TMP_DIR/cortex https://s3-us-west-2.amazonaws.com/get-cortex/$CORTEX_VERSION_STABLE/cli/$PARSED_OS/cortex - chmod +x $CORTEX_SH_TMP_DIR/cortex - - if [ $(id -u) = 0 ]; then - mv $CORTEX_SH_TMP_DIR/cortex /usr/local/bin/cortex - else - ask_sudo - sudo mv $CORTEX_SH_TMP_DIR/cortex /usr/local/bin/cortex - fi - - rm -rf $CORTEX_SH_TMP_DIR - echo "✓ Installed the Cortex CLI" - - bash_profile_path=$(get_bash_profile) - if [ ! "$bash_profile_path" = "" ]; then - if ! grep -Fxq "source <(cortex completion)" "$bash_profile_path"; then - echo - read -p "Would you like to modify your bash profile ($bash_profile_path) to enable cortex command completion and the cx alias? [Y/n] " -n 1 -r - echo - if [[ $REPLY =~ ^[Yy]$ ]]; then - echo -e "\nsource <(cortex completion)" >> $bash_profile_path - echo "✓ Your bash profile ($bash_profile_path) has been updated" - echo - echo "Note: \`bash_completion\` must be installed on your system for cortex command completion to function properly" - echo - echo "Command to update your current terminal session:" - echo " source $bash_profile_path" - else - echo "Your bash profile has not been modified. 
If you would like to modify it manually, add this line to your bash profile:" - echo " source <(cortex completion)" - echo "Note: \`bash_completion\` must be installed on your system for cortex command completion to function properly" - fi - fi - else - echo -e "\nIf your would like to enable cortex command completion and the cx alias, add this line to your bash profile:" - echo " source <(cortex completion)" - echo "Note: \`bash_completion\` must be installed on your system for cortex command completion to function properly" - fi -} - -function uninstall_cortex_cli() { - set -e - - rm -rf $HOME/.cortex - - if ! command -v cortex >/dev/null; then - echo -e "\nThe Cortex CLI is not installed" - return - fi - - if [[ ! -f /usr/local/bin/cortex ]]; then - echo -e "\nThe Cortex CLI was not found at /usr/local/bin/cortex, please uninstall it manually" - return - fi - - if [ $(id -u) = 0 ]; then - rm /usr/local/bin/cortex - else - ask_sudo - sudo rm /usr/local/bin/cortex - fi - echo -e "\n✓ Uninstalled the Cortex CLI" - - bash_profile_path=$(get_bash_profile) - if [ ! "$bash_profile_path" = "" ]; then - if grep -Fxq "source <(cortex completion)" "$bash_profile_path"; then - echo - read -p "Would you like to remove \"source <(cortex completion)\" from your bash profile ($bash_profile_path)? 
[Y/n] " -n 1 -r - echo - if [[ $REPLY =~ ^[Yy]$ ]]; then - sed '/^source <(cortex completion)$/d' "$bash_profile_path" > "${bash_profile_path}_cortex_modified" && mv -f "${bash_profile_path}_cortex_modified" "$bash_profile_path" - echo "✓ Your bash profile ($bash_profile_path) has been updated" - fi - fi - fi -} - -function get_bash_profile() { - if [ "$PARSED_OS" = "darwin" ]; then - if [ -f $HOME/.bash_profile ]; then - echo $HOME/.bash_profile - return - elif [ -f $HOME/.bashrc ]; then - echo $HOME/.bashrc - return - fi - else - if [ -f $HOME/.bashrc ]; then - echo $HOME/.bashrc - return - elif [ -f $HOME/.bash_profile ]; then - echo $HOME/.bash_profile - return - fi - fi - - echo "" -} - -function ask_sudo() { - if ! sudo -n true 2>/dev/null; then - echo -e "\nPlease enter your sudo password" - fi -} - -function prompt_for_telemetry() { - if [ "$CORTEX_ENABLE_TELEMETRY" != "true" ] && [ "$CORTEX_ENABLE_TELEMETRY" != "false" ]; then - while true - do - echo - read -p "Would you like to help improve Cortex by anonymously sending error reports and usage stats to the dev team? [Y/n] " -n 1 -r - echo - if [[ $REPLY =~ ^[Yy]$ ]]; then - export CORTEX_ENABLE_TELEMETRY=true - break - elif [[ $REPLY =~ ^[Nn]$ ]]; then - export CORTEX_ENABLE_TELEMETRY=false - break - fi - echo "Unexpected value, please enter \"Y\" or \"n\"" - done - fi -} - -###################### -### ARG PROCESSING ### -###################### - -arg1=${1:-""} -arg2=${2:-""} -arg3=${3:-""} - -if [ -z "$arg1" ]; then - show_help - exit 0 -fi - -if [ "$arg1" = "install" ]; then - if [ ! 
"$arg3" = "" ]; then - echo -e "\nerror: too many arguments for install command" - show_help - exit 1 - elif [ "$arg2" = "operator" ]; then - install_operator - elif [ "$arg2" = "cli" ]; then - install_cli - elif [ "$arg2" = "kubernetes-tools" ]; then - install_kubernetes_tools - elif [ "$arg2" = "" ]; then - echo -e "\nerror: missing subcommand for install" - show_help - exit 1 - else - echo -e "\nerror: invalid subcommand for install: $arg2" - show_help - exit 1 - fi -elif [ "$arg1" = "uninstall" ]; then - if [ ! "$arg3" = "" ]; then - echo -e "\nerror: too many arguments for uninstall command" - show_help - exit 1 - elif [ "$arg2" = "operator" ]; then - uninstall_operator - elif [ "$arg2" = "cli" ]; then - uninstall_cli - elif [ "$arg2" = "kubernetes-tools" ]; then - uninstall_kubernetes_tools - elif [ "$arg2" = "" ]; then - echo -e "\nerror: missing subcommand for uninstall" - show_help - exit 1 - else - echo -e "\nerror: invalid subcommand for uninstall: $arg2" - show_help - exit 1 - fi -elif [ "$arg1" = "update" ]; then - if [ ! "$arg3" = "" ]; then - echo -e "\nerror: too many arguments for update command" - show_help - exit 1 - elif [ "$arg2" = "operator" ]; then - update_operator - elif [ "$arg2" = "" ]; then - echo -e "\nerror: missing subcommand for update" - show_help - exit 1 - else - echo -e "\nerror: invalid subcommand for update: $arg2" - show_help - exit 1 - fi -elif [ "$arg1" = "get" ]; then - if [ ! 
"$arg3" = "" ]; then - echo -e "\nerror: too many arguments for get command" - show_help - exit 1 - elif [ "$arg2" = "endpoints" ]; then - get_endpoints - elif [ "$arg2" = "" ]; then - echo -e "\nerror: missing subcommand for get" - show_help - exit 1 - else - echo -e "\nerror: invalid subcommand for get: $arg2" - show_help - exit 1 - fi -else - echo -e "\nerror: unknown command: $arg1" - show_help - exit 1 -fi diff --git a/cortex.sh b/cortex.sh new file mode 100755 index 0000000000..cdc5c079ec --- /dev/null +++ b/cortex.sh @@ -0,0 +1,450 @@ +#!/bin/bash + +# Copyright 2019 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -e + +#################### +### FLAG PARSING ### +#################### + +flag_help=false +positional_args=() + +while [[ $# -gt 0 ]]; do + key="$1" + case $key in + -c|--config) + export CORTEX_CONFIG="$2" + shift + shift + ;; + -h|--help) + flag_help="true" + shift + ;; + *) + positional_args+=("$1") + shift + ;; + esac +done + +set -- "${positional_args[@]}" +positional_args=() +for i in "$@"; do + case $i in + -c=*|--config=*) + export CORTEX_CONFIG="${i#*=}" + shift + ;; + -h=*|--help=*) + flag_help="true" + ;; + *) + positional_args+=("$1") + shift + ;; + esac +done + +set -- "${positional_args[@]}" +if [ "$flag_help" == "true" ]; then + show_help + exit 0 +fi + +for arg in "$@"; do + if [[ "$arg" == -* ]]; then + echo "unknown flag: $arg" + show_help + exit 1 + fi +done + +##################### +### CONFIGURATION ### +##################### + +if [ "$CORTEX_CONFIG" != "" ]; then + if [ ! -f "$CORTEX_CONFIG" ]; then + echo "cortex config file does not exist: $CORTEX_CONFIG" + fi + source $CORTEX_CONFIG +fi + +set -u + +export CORTEX_VERSION_STABLE=master + +# Defaults +random_id=$(cat /dev/urandom | LC_CTYPE=C tr -dc 'a-z0-9' | fold -w 12 | head -n 1) + +export AWS_ACCESS_KEY_ID="${AWS_ACCESS_KEY_ID:-""}" +export AWS_SECRET_ACCESS_KEY="${AWS_SECRET_ACCESS_KEY:-""}" + +export CORTEX_LOG_GROUP="${CORTEX_LOG_GROUP:-cortex}" +export CORTEX_BUCKET="${CORTEX_BUCKET:-cortex-$random_id}" +export CORTEX_REGION="${CORTEX_REGION:-us-west-2}" + +export CORTEX_CLUSTER_NAME="${CORTEX_CLUSTER_NAME:-cortex}" +export CORTEX_NODE_TYPE="${CORTEX_NODE_TYPE:-t3.medium}" +export CORTEX_NODES_MIN="${CORTEX_NODES_MIN:-1}" +export CORTEX_NODES_MAX="${CORTEX_NODES_MAX:-3}" + +export CORTEX_NAMESPACE="${CORTEX_NAMESPACE:-cortex}" + +export CORTEX_IMAGE_ARGO_CONTROLLER="${CORTEX_IMAGE_ARGO_CONTROLLER:-cortexlabs/argo-controller:$CORTEX_VERSION_STABLE}" +export CORTEX_IMAGE_ARGO_EXECUTOR="${CORTEX_IMAGE_ARGO_EXECUTOR:-cortexlabs/argo-executor:$CORTEX_VERSION_STABLE}" +export 
CORTEX_IMAGE_FLUENTD="${CORTEX_IMAGE_FLUENTD:-cortexlabs/fluentd:$CORTEX_VERSION_STABLE}" +export CORTEX_IMAGE_NGINX_BACKEND="${CORTEX_IMAGE_NGINX_BACKEND:-cortexlabs/nginx-backend:$CORTEX_VERSION_STABLE}" +export CORTEX_IMAGE_NGINX_CONTROLLER="${CORTEX_IMAGE_NGINX_CONTROLLER:-cortexlabs/nginx-controller:$CORTEX_VERSION_STABLE}" +export CORTEX_IMAGE_OPERATOR="${CORTEX_IMAGE_OPERATOR:-cortexlabs/operator:$CORTEX_VERSION_STABLE}" +export CORTEX_IMAGE_SPARK="${CORTEX_IMAGE_SPARK:-cortexlabs/spark:$CORTEX_VERSION_STABLE}" +export CORTEX_IMAGE_SPARK_OPERATOR="${CORTEX_IMAGE_SPARK_OPERATOR:-cortexlabs/spark-operator:$CORTEX_VERSION_STABLE}" +export CORTEX_IMAGE_TF_SERVE="${CORTEX_IMAGE_TF_SERVE:-cortexlabs/tf-serve:$CORTEX_VERSION_STABLE}" +export CORTEX_IMAGE_TF_TRAIN="${CORTEX_IMAGE_TF_TRAIN:-cortexlabs/tf-train:$CORTEX_VERSION_STABLE}" +export CORTEX_IMAGE_TF_API="${CORTEX_IMAGE_TF_API:-cortexlabs/tf-api:$CORTEX_VERSION_STABLE}" +export CORTEX_IMAGE_PYTHON_PACKAGER="${CORTEX_IMAGE_PYTHON_PACKAGER:-cortexlabs/python-packager:$CORTEX_VERSION_STABLE}" +export CORTEX_IMAGE_TF_SERVE_GPU="${CORTEX_IMAGE_TF_SERVE_GPU:-cortexlabs/tf-serve-gpu:$CORTEX_VERSION_STABLE}" +export CORTEX_IMAGE_TF_TRAIN_GPU="${CORTEX_IMAGE_TF_TRAIN_GPU:-cortexlabs/tf-train-gpu:$CORTEX_VERSION_STABLE}" + +export CORTEX_ENABLE_TELEMETRY="${CORTEX_ENABLE_TELEMETRY:-""}" + +########################## +### TOP-LEVEL COMMANDS ### +########################## + +function aws() { + docker run --entrypoint /root/aws.sh \ + -e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \ + -e CORTEX_CLUSTER_NAME=$CORTEX_CLUSTER_NAME \ + -e CORTEX_NODE_TYPE=$CORTEX_NODE_TYPE \ + -e CORTEX_NODES_MIN=$CORTEX_NODES_MIN \ + -e CORTEX_NODES_MAX=$CORTEX_NODES_MAX \ + -e CORTEX_LOG_GROUP=$CORTEX_LOG_GROUP \ + -e CORTEX_BUCKET=$CORTEX_BUCKET \ + -e CORTEX_REGION=$CORTEX_REGION \ + cortexlabs/manager +} + +function install() { + docker run --entrypoint /root/install.sh \ + -e 
AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \ + -e CORTEX_CLUSTER_NAME=$CORTEX_CLUSTER_NAME \ + -e CORTEX_NAMESPACE=$CORTEX_NAMESPACE \ + -e CORTEX_LOG_GROUP=$CORTEX_LOG_GROUP \ + -e CORTEX_BUCKET=$CORTEX_BUCKET \ + -e CORTEX_REGION=$CORTEX_REGION \ + -e CORTEX_IMAGE_ARGO_CONTROLLER=$CORTEX_IMAGE_ARGO_CONTROLLER \ + -e CORTEX_IMAGE_ARGO_EXECUTOR=$CORTEX_IMAGE_ARGO_EXECUTOR \ + -e CORTEX_IMAGE_FLUENTD=$CORTEX_IMAGE_FLUENTD \ + -e CORTEX_IMAGE_NGINX_BACKEND=$CORTEX_IMAGE_NGINX_BACKEND \ + -e CORTEX_IMAGE_NGINX_CONTROLLER=$CORTEX_IMAGE_NGINX_CONTROLLER \ + -e CORTEX_IMAGE_OPERATOR=$CORTEX_IMAGE_OPERATOR \ + -e CORTEX_IMAGE_SPARK=$CORTEX_IMAGE_SPARK \ + -e CORTEX_IMAGE_SPARK_OPERATOR=$CORTEX_IMAGE_SPARK_OPERATOR \ + -e CORTEX_IMAGE_TF_SERVE=$CORTEX_IMAGE_TF_SERVE \ + -e CORTEX_IMAGE_TF_TRAIN=$CORTEX_IMAGE_TF_TRAIN \ + -e CORTEX_IMAGE_TF_API=$CORTEX_IMAGE_TF_API \ + -e CORTEX_IMAGE_PYTHON_PACKAGER=$CORTEX_IMAGE_PYTHON_PACKAGER \ + -e CORTEX_IMAGE_TF_SERVE_GPU=$CORTEX_IMAGE_TF_SERVE_GPU \ + -e CORTEX_IMAGE_TF_TRAIN_GPU=$CORTEX_IMAGE_TF_TRAIN_GPU \ + -e CORTEX_ENABLE_TELEMETRY=$CORTEX_ENABLE_TELEMETRY \ + cortexlabs/manager +} + +function uninstall() { + docker run --entrypoint /root/uninstall.sh \ + -e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \ + -e CORTEX_CLUSTER_NAME=$CORTEX_CLUSTER_NAME \ + -e CORTEX_NAMESPACE=$CORTEX_NAMESPACE \ + cortexlabs/manager +} + +function info() { + docker run --entrypoint /root/info.sh \ + -e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \ + -e CORTEX_CLUSTER_NAME=$CORTEX_CLUSTER_NAME \ + -e CORTEX_NAMESPACE=$CORTEX_NAMESPACE \ + cortexlabs/manager +} + +function update() { + docker run --entrypoint /root/update.sh \ + -e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \ + -e CORTEX_CLUSTER_NAME=$CORTEX_CLUSTER_NAME \ + -e CORTEX_NAMESPACE=$CORTEX_NAMESPACE \ + 
cortexlabs/manager +} + +################ +### CHECK OS ### +################ + +case "$OSTYPE" in + darwin*) PARSED_OS="darwin" ;; + linux*) PARSED_OS="linux" ;; + *) echo -e "\nerror: only mac and linux are supported"; exit 1 ;; +esac + +############################# +### DEPENDENCY MANAGEMENT ### +############################# + +function check_dep_curl() { + if ! command -v curl >/dev/null; then + echo -e "\nerror: please install \`curl\`" + exit 1 + fi +} + +function install_cli() { + set -e + + if command -v cortex >/dev/null; then + echo "The Cortex CLI is already installed" + return + fi + + check_dep_curl + + echo -e "\nInstalling the Cortex CLI (/usr/local/bin/cortex) ..." + + CORTEX_SH_TMP_DIR="$HOME/.cortex-sh-tmp" + rm -rf $CORTEX_SH_TMP_DIR && mkdir -p $CORTEX_SH_TMP_DIR + curl -s -o $CORTEX_SH_TMP_DIR/cortex https://s3-us-west-2.amazonaws.com/get-cortex/$CORTEX_VERSION_STABLE/cli/$PARSED_OS/cortex + chmod +x $CORTEX_SH_TMP_DIR/cortex + + if [ $(id -u) = 0 ]; then + mv $CORTEX_SH_TMP_DIR/cortex /usr/local/bin/cortex + else + ask_sudo + sudo mv $CORTEX_SH_TMP_DIR/cortex /usr/local/bin/cortex + fi + + rm -rf $CORTEX_SH_TMP_DIR + echo "✓ Installed the Cortex CLI" + + bash_profile_path=$(get_bash_profile) + if [ ! "$bash_profile_path" = "" ]; then + if ! grep -Fxq "source <(cortex completion)" "$bash_profile_path"; then + echo + read -p "Would you like to modify your bash profile ($bash_profile_path) to enable cortex command completion and the cx alias? [Y/n] " -n 1 -r + echo + if [[ $REPLY =~ ^[Yy]$ ]]; then + echo -e "\nsource <(cortex completion)" >> $bash_profile_path + echo "✓ Your bash profile ($bash_profile_path) has been updated" + echo + echo "Note: \`bash_completion\` must be installed on your system for cortex command completion to function properly" + echo + echo "Command to update your current terminal session:" + echo " source $bash_profile_path" + else + echo "Your bash profile has not been modified. 
If you would like to modify it manually, add this line to your bash profile:" + echo " source <(cortex completion)" + echo "Note: \`bash_completion\` must be installed on your system for cortex command completion to function properly" + fi + fi + else + echo -e "\nIf your would like to enable cortex command completion and the cx alias, add this line to your bash profile:" + echo " source <(cortex completion)" + echo "Note: \`bash_completion\` must be installed on your system for cortex command completion to function properly" + fi +} + +function uninstall_cli() { + set -e + + rm -rf $HOME/.cortex + + if ! command -v cortex >/dev/null; then + echo -e "\nThe Cortex CLI is not installed" + return + fi + + if [[ ! -f /usr/local/bin/cortex ]]; then + echo -e "\nThe Cortex CLI was not found at /usr/local/bin/cortex, please uninstall it manually" + return + fi + + if [ $(id -u) = 0 ]; then + rm /usr/local/bin/cortex + else + ask_sudo + sudo rm /usr/local/bin/cortex + fi + echo -e "\n✓ Uninstalled the Cortex CLI" + + bash_profile_path=$(get_bash_profile) + if [ ! "$bash_profile_path" = "" ]; then + if grep -Fxq "source <(cortex completion)" "$bash_profile_path"; then + echo + read -p "Would you like to remove \"source <(cortex completion)\" from your bash profile ($bash_profile_path)? 
[Y/n] " -n 1 -r + echo + if [[ $REPLY =~ ^[Yy]$ ]]; then + sed '/^source <(cortex completion)$/d' "$bash_profile_path" > "${bash_profile_path}_cortex_modified" && mv -f "${bash_profile_path}_cortex_modified" "$bash_profile_path" + echo "✓ Your bash profile ($bash_profile_path) has been updated" + fi + fi + fi +} + +function get_bash_profile() { + if [ "$PARSED_OS" = "darwin" ]; then + if [ -f $HOME/.bash_profile ]; then + echo $HOME/.bash_profile + return + elif [ -f $HOME/.bashrc ]; then + echo $HOME/.bashrc + return + fi + else + if [ -f $HOME/.bashrc ]; then + echo $HOME/.bashrc + return + elif [ -f $HOME/.bash_profile ]; then + echo $HOME/.bash_profile + return + fi + fi + + echo "" +} + +function ask_sudo() { + if ! sudo -n true 2>/dev/null; then + echo -e "\nPlease enter your sudo password" + fi +} + +function prompt_for_telemetry() { + if [ "$CORTEX_ENABLE_TELEMETRY" != "true" ] && [ "$CORTEX_ENABLE_TELEMETRY" != "false" ]; then + while true + do + echo + read -p "Would you like to help improve Cortex by anonymously sending error reports and usage stats to the dev team? [Y/n] " -n 1 -r + echo + if [[ $REPLY =~ ^[Yy]$ ]]; then + export CORTEX_ENABLE_TELEMETRY=true + break + elif [[ $REPLY =~ ^[Nn]$ ]]; then + export CORTEX_ENABLE_TELEMETRY=false + break + fi + echo "Unexpected value, please enter \"Y\" or \"n\"" + done + fi +} + +############ +### HELP ### +############ + +function show_help() { + echo " +Usage: + ./cortex.sh command [sub-command] [flags] + +Available Commands: + install install Cortex + uninstall uninstall Cortex + update update Cortex + info information about Cortex + + install cli install the Cortex CLI + uninstall cli uninstall the Cortex CLI + +Flags: + -c, --config path to a Cortex config file + -h, --help +" +} + +###################### +### ARG PROCESSING ### +###################### + +arg1=${1:-""} +arg2=${2:-""} +arg3=${3:-""} + +if [ -z "$arg1" ]; then + show_help + exit 0 +fi + +if [ "$arg1" = "install" ]; then + if [ ! 
"$arg3" = "" ]; then + echo -e "\nerror: too many arguments for install command" + show_help + exit 1 + elif [ "$arg2" = "" ]; then + aws && install && info + elif [ "$arg2" = "cli" ]; then + install_cli + elif [ "$arg2" = "" ]; then + echo -e "\nerror: missing subcommand for install" + show_help + exit 1 + else + echo -e "\nerror: invalid subcommand for install: $arg2" + show_help + exit 1 + fi +elif [ "$arg1" = "uninstall" ]; then + if [ ! "$arg3" = "" ]; then + echo -e "\nerror: too many arguments for uninstall command" + show_help + exit 1 + elif [ "$arg2" = "" ]; then + uninstall + elif [ "$arg2" = "cli" ]; then + uninstall_cli + elif [ "$arg2" = "" ]; then + echo -e "\nerror: missing subcommand for uninstall" + show_help + exit 1 + else + echo -e "\nerror: invalid subcommand for uninstall: $arg2" + show_help + exit 1 + fi +elif [ "$arg1" = "update" ]; then + if [ ! "$arg2" = "" ]; then + echo -e "\nerror: too many arguments for get command" + show_help + exit 1 + else + update && install + fi +elif [ "$arg1" = "info" ]; then + if [ ! "$arg2" = "" ]; then + echo -e "\nerror: too many arguments for get command" + show_help + exit 1 + else + info + fi +else + echo -e "\nerror: unknown command: $arg1" + show_help + exit 1 +fi diff --git a/docs/cluster/config.md b/docs/cluster/config.md index 898c5b3150..7154cc91fa 100644 --- a/docs/cluster/config.md +++ b/docs/cluster/config.md @@ -1,6 +1,6 @@ # Config -These environment variables can be modified and exported before running `cortex-installer.sh` commands. Alternatively, a configuration file may be provided to `cortex-installer.sh` via the `--config` flag (e.g. `cortex-installer.sh --config=./config.sh install operator`). Default values are shown. +These environment variables can be modified and exported before running `cortex.sh` commands. Alternatively, a configuration file may be provided to `cortex.sh` via the `--config` flag (e.g. `cortex.sh --config=./config.sh install`). Default values are shown. 
@@ -20,12 +20,21 @@ export CORTEX_BUCKET="cortex-[RANDOM_ID]" # The AWS region Cortex will use export CORTEX_REGION="us-west-2" +# The name of the EKS cluster Cortex will use +export CORTEX_CLUSTER_NAME="cortex" + +# The AWS node type Cortex will use +export CORTEX_NODE_TYPE="t3.medium" + +# Minimum number of nodes in the cluster +export CORTEX_NODES_MIN=1 + +# Maximum number of nodes in the cluster +export CORTEX_NODES_MAX=3 + # The name of the Kubernetes namespace Cortex will use export CORTEX_NAMESPACE="cortex" -# Flag to enable collecting error reports and usage stats. If flag is not set to either "true" or "false", you will be prompted. -export CORTEX_ENABLE_TELEMETRY="" - # Image paths export CORTEX_IMAGE_ARGO_CONTROLLER="cortexlabs/argo-controller:master" export CORTEX_IMAGE_ARGO_EXECUTOR="cortexlabs/argo-executor:master" @@ -41,4 +50,7 @@ export CORTEX_IMAGE_TF_API="cortexlabs/tf-api:master" export CORTEX_IMAGE_TF_TRAIN_GPU="cortexlabs/tf-train-gpu:master" export CORTEX_IMAGE_TF_SERVE_GPU="cortexlabs/tf-serve-gpu:master" export CORTEX_IMAGE_PYTHON_PACKAGER="cortexlabs/python-packager:master" + +# Flag to enable collecting error reports and usage stats. If flag is not set to either "true" or "false", you will be prompted. +export CORTEX_ENABLE_TELEMETRY="" ``` diff --git a/docs/cluster/install.md b/docs/cluster/install.md index 20ed12f141..563bc228a1 100644 --- a/docs/cluster/install.md +++ b/docs/cluster/install.md @@ -4,57 +4,33 @@ 1. [AWS credentials](aws.md) -## Download the install script +## Spin up Cortex in your AWS account ```bash # Download -curl -O https://raw.githubusercontent.com/cortexlabs/cortex/master/cortex-installer.sh +curl -O https://raw.githubusercontent.com/cortexlabs/cortex/master/cortex.sh # Change permissions -chmod +x cortex-installer.sh +chmod +x cortex.sh # Set AWS credentials export AWS_ACCESS_KEY_ID=*** export AWS_SECRET_ACCESS_KEY=*** -``` - -## Kubernetes - -Cortex runs on Kubernetes and requires access to `kubectl`. 
If you don't already have a Kubernetes cluster, [eksctl](https://eksctl.io) is a simple tool to create and manage one. - -**We recommend a minimum cluster size of 2 [t3.medium](https://aws.amazon.com/ec2/instance-types) AWS instances. Cortex may not run successfully on clusters with less compute resources.** -```bash -# Install kubectl, eksctl, and aws-iam-authenticator -./cortex-installer.sh install kubernetes-tools - -# Spin up an EKS cluster (this takes ~20 minutes; see eksctl.io for more options) -eksctl create cluster --name=cortex --nodes=2 --node-type=t3.medium -``` - -This cluster configuration will cost about $0.29 per hour in AWS fees. - -## Install the operator - -The Cortex operator is a service that runs on Kubernetes, translates declarative configuration into workloads, and orchestrates those workloads on the cluster. Its installation is configurable. For a full list of configuration options please refer to the [operator config](config.md) documentation. - -```bash -# Install the Cortex operator -./cortex-installer.sh install operator +# Install Cortex +./cortex.sh install ``` ## Install the CLI -The CLI runs on developer machines (e.g. your laptop) and communicates with the operator. 
- ```bash # Install the Cortex CLI -./cortex-installer.sh install cli +./cortex.sh install cli # Get the operator endpoint -./cortex-installer.sh get endpoints +./cortex.sh info # Configure the CLI cortex configure diff --git a/docs/cluster/uninstall.md b/docs/cluster/uninstall.md index 3c5dd5ebac..312dcecb75 100644 --- a/docs/cluster/uninstall.md +++ b/docs/cluster/uninstall.md @@ -1,33 +1,29 @@ # Uninstall -## Download the uninstall script +## Uninstall Cortex ```bash # Download -curl -O https://raw.githubusercontent.com/cortexlabs/cortex/master/cortex-installer.sh +curl -O https://raw.githubusercontent.com/cortexlabs/cortex/master/cortex.sh # Change permissions -chmod +x cortex-installer.sh +chmod +x cortex.sh # Set AWS credentials export AWS_ACCESS_KEY_ID=*** export AWS_SECRET_ACCESS_KEY=*** -``` - -## Uninstall the operator -```bash -# Uninstall the Cortex operator -./cortex-installer.sh uninstall operator +# Uninstall +./cortex.sh uninstall ``` ## Uninstall the CLI ```bash # Uninstall the Cortex CLI -./cortex-installer.sh uninstall cli +./cortex.sh uninstall cli ``` ## Clean up AWS @@ -43,22 +39,4 @@ aws s3 rb --force s3:// # Delete the log group aws logs delete-log-group --log-group-name cortex --region us-west-2 - -# Uninstall the AWS CLI (if you used cortex-installer.sh to install it) -sudo rm -rf /usr/local/aws && sudo rm /usr/local/bin/aws && rm -rf ~/.aws -``` - -## Spin down Kubernetes - -If you used [`eksctl`](https://eksctl.io) to create your cluster, you can use it to spin the cluster down.
- -**Make sure the Cortex operator is uninstalled to prevent AWS resource deletion deadlocks.** - -```bash -# Spin down an EKS cluster -eksctl delete cluster --name=cortex -# Confirm that both eksctl CloudFormation stacks have been deleted via the AWS console - -# Uninstall kubectl, eksctl, and aws-iam-authenticator -./cortex-installer.sh uninstall kubernetes-tools ``` diff --git a/images/manager/Dockerfile b/images/manager/Dockerfile new file mode 100644 index 0000000000..8e8df5b40e --- /dev/null +++ b/images/manager/Dockerfile @@ -0,0 +1,23 @@ +FROM python:3.7-alpine3.10 + +WORKDIR /root + +RUN pip3 install awscli --upgrade --user +RUN mv .local/bin/aws /usr/local/bin/ + +RUN apk add --no-cache bash curl gettext + +RUN curl --location "https://github.com/weaveworks/eksctl/releases/download/0.1.38/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp +RUN mv /tmp/eksctl /usr/local/bin + +RUN curl -o aws-iam-authenticator https://amazon-eks.s3-us-west-2.amazonaws.com/1.13.7/2019-06-11/bin/linux/amd64/aws-iam-authenticator +RUN chmod +x ./aws-iam-authenticator +RUN mv ./aws-iam-authenticator /usr/local/bin/aws-iam-authenticator + +RUN curl -LO https://storage.googleapis.com/kubernetes-release/release/v1.15.0/bin/linux/amd64/kubectl +RUN chmod +x ./kubectl +RUN mv ./kubectl /usr/local/bin/kubectl + +COPY manager /root + +ENTRYPOINT ["/bin/bash"] diff --git a/manager/aws.sh b/manager/aws.sh new file mode 100755 index 0000000000..6206a1da5e --- /dev/null +++ b/manager/aws.sh @@ -0,0 +1,50 @@ +#!/bin/bash + +# Copyright 2019 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +function setup_bucket() { + if ! aws s3api head-bucket --bucket $CORTEX_BUCKET --output json 2>/dev/null; then + if aws s3 ls "s3://$CORTEX_BUCKET" --output json 2>&1 | grep -q 'NoSuchBucket'; then + echo -e "\nCreating S3 bucket: $CORTEX_BUCKET" + aws s3api create-bucket --bucket $CORTEX_BUCKET \ + --region $CORTEX_REGION \ + --create-bucket-configuration LocationConstraint=$CORTEX_REGION \ + >/dev/null + else + echo -e "\nA bucket named \"${CORTEX_BUCKET}\" already exists, but you do not have access to it" + exit 1 + fi + else + echo -e "\nUsing existing S3 bucket: $CORTEX_BUCKET" + fi +} + +function setup_cloudwatch_logs() { + if ! aws logs list-tags-log-group --log-group-name $CORTEX_LOG_GROUP --region $CORTEX_REGION --output json 2>&1 | grep -q "\"tags\":"; then + echo -e "\nCreating CloudWatch log group: $CORTEX_LOG_GROUP" + aws logs create-log-group --log-group-name $CORTEX_LOG_GROUP --region $CORTEX_REGION + else + echo -e "\nUsing existing CloudWatch log group: $CORTEX_LOG_GROUP" + fi +} + +echo "Installing Cortex ... (this will take about 20 minutes)" + +eksctl create cluster --name=$CORTEX_CLUSTER_NAME --asg-access --node-type=$CORTEX_NODE_TYPE --nodes-min=$CORTEX_NODES_MIN --nodes-max=$CORTEX_NODES_MAX + +setup_bucket +setup_cloudwatch_logs diff --git a/manager/info.sh b/manager/info.sh new file mode 100755 index 0000000000..05af77e4c1 --- /dev/null +++ b/manager/info.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# Copyright 2019 Cortex Labs, Inc.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +function get_operator_endpoint() { + set -eo pipefail + kubectl -n=$CORTEX_NAMESPACE get service nginx-controller-operator -o json | tr -d '[:space:]' | sed 's/.*{\"hostname\":\"\(.*\)\".*/\1/' +} + +function get_apis_endpoint() { + set -eo pipefail + kubectl -n=$CORTEX_NAMESPACE get service nginx-controller-apis -o json | tr -d '[:space:]' | sed 's/.*{\"hostname\":\"\(.*\)\".*/\1/' +} + +eksctl utils write-kubeconfig --name=$CORTEX_CLUSTER_NAME >/dev/null 2>&1 + +operator_endpoint=$(get_operator_endpoint) +apis_endpoint=$(get_apis_endpoint) + +echo +echo "Operator endpoint: $operator_endpoint" +echo "APIs endpoint: $apis_endpoint" diff --git a/manager/install.sh b/manager/install.sh new file mode 100755 index 0000000000..413492c33f --- /dev/null +++ b/manager/install.sh @@ -0,0 +1,136 @@ +#!/bin/bash + +# Copyright 2019 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -e + +function setup_configmap() { + kubectl -n=$CORTEX_NAMESPACE create configmap 'cortex-config' \ + --from-literal='LOG_GROUP'=$CORTEX_LOG_GROUP \ + --from-literal='BUCKET'=$CORTEX_BUCKET \ + --from-literal='REGION'=$CORTEX_REGION \ + --from-literal='NAMESPACE'=$CORTEX_NAMESPACE \ + --from-literal='IMAGE_OPERATOR'=$CORTEX_IMAGE_OPERATOR \ + --from-literal='IMAGE_SPARK'=$CORTEX_IMAGE_SPARK \ + --from-literal='IMAGE_TF_TRAIN'=$CORTEX_IMAGE_TF_TRAIN \ + --from-literal='IMAGE_TF_SERVE'=$CORTEX_IMAGE_TF_SERVE \ + --from-literal='IMAGE_TF_API'=$CORTEX_IMAGE_TF_API \ + --from-literal='IMAGE_PYTHON_PACKAGER'=$CORTEX_IMAGE_PYTHON_PACKAGER \ + --from-literal='IMAGE_TF_TRAIN_GPU'=$CORTEX_IMAGE_TF_TRAIN_GPU \ + --from-literal='IMAGE_TF_SERVE_GPU'=$CORTEX_IMAGE_TF_SERVE_GPU \ + --from-literal='ENABLE_TELEMETRY'=$CORTEX_ENABLE_TELEMETRY \ + -o yaml --dry-run | kubectl apply -f - >/dev/null +} + +function setup_secrets() { + kubectl -n=$CORTEX_NAMESPACE create secret generic 'aws-credentials' \ + --from-literal='AWS_ACCESS_KEY_ID'=$AWS_ACCESS_KEY_ID \ + --from-literal='AWS_SECRET_ACCESS_KEY'=$AWS_SECRET_ACCESS_KEY \ + -o yaml --dry-run | kubectl apply -f - >/dev/null +} + +function validate_cortex() { + set +e + + echo -en "\nWaiting for the Cortex operator to be ready " + + operator_load_balancer="waiting" + api_load_balancer="waiting" + operator_endpoint_reachable="waiting" + operator_pod_ready_cycles=0 + operator_endpoint="" + + while true; do + echo -n "." 
+ sleep 5 + + operator_pod_name=$(kubectl -n=$CORTEX_NAMESPACE get pods -o=name --sort-by=.metadata.creationTimestamp | grep "^pod/operator-" | tail -1) + if [ "$operator_pod_name" == "" ]; then + operator_pod_ready_cycles=0 + else + is_ready=$(kubectl -n=$CORTEX_NAMESPACE get "$operator_pod_name" -o jsonpath='{.status.containerStatuses[0].ready}') + if [ "$is_ready" == "true" ]; then + ((operator_pod_ready_cycles++)) + else + operator_pod_ready_cycles=0 + fi + fi + + if [ "$operator_load_balancer" != "ready" ]; then + out=$(kubectl -n=$CORTEX_NAMESPACE get service nginx-controller-operator -o json | tr -d '[:space:]') + if [[ $out != *'"loadBalancer":{"ingress":[{"'* ]]; then + continue + fi + operator_load_balancer="ready" + fi + + if [ "$api_load_balancer" != "ready" ]; then + out=$(kubectl -n=$CORTEX_NAMESPACE get service nginx-controller-apis -o json | tr -d '[:space:]') + if [[ $out != *'"loadBalancer":{"ingress":[{"'* ]]; then + continue + fi + api_load_balancer="ready" + fi + + if [ "$operator_endpoint" = "" ]; then + operator_endpoint=$(kubectl -n=$CORTEX_NAMESPACE get service nginx-controller-operator -o json | tr -d '[:space:]' | sed 's/.*{\"hostname\":\"\(.*\)\".*/\1/') + fi + + if [ "$operator_endpoint_reachable" != "ready" ]; then + if ! curl $operator_endpoint >/dev/null 2>&1; then + continue + fi + operator_endpoint_reachable="ready" + fi + + if [ "$operator_pod_ready_cycles" == "0" ] && [ "$operator_pod_name" != "" ]; then + num_restart=$(kubectl -n=$CORTEX_NAMESPACE get "$operator_pod_name" -o jsonpath='{.status.containerStatuses[0].restartCount}') + if [[ $num_restart -ge 2 ]]; then + echo -e "\n\nAn error occurred when starting the Cortex operator. View the logs with:" + echo " kubectl logs $operator_pod_name --namespace=$CORTEX_NAMESPACE" + exit 1 + fi + continue + fi + + if [[ $operator_pod_ready_cycles -lt 3 ]]; then + continue + fi + + echo " ✓" + break + done + + echo -e "\nCortex is ready!" 
+ + if command -v cortex >/dev/null; then + echo -e "\nPlease run \`cortex configure\` to make sure your CLI is configured correctly" + fi +} + +eksctl utils write-kubeconfig --name=$CORTEX_CLUSTER_NAME >/dev/null 2>&1 + +envsubst < manifests/namespace.yaml | kubectl apply -f - >/dev/null + +setup_configmap +setup_secrets + +envsubst < manifests/spark.yaml | kubectl apply -f - >/dev/null +envsubst < manifests/argo.yaml | kubectl apply -f - >/dev/null +envsubst < manifests/nginx.yaml | kubectl apply -f - >/dev/null +envsubst < manifests/fluentd.yaml | kubectl apply -f - >/dev/null +envsubst < manifests/operator.yaml | kubectl apply -f - >/dev/null + +validate_cortex diff --git a/manager/manifests/argo.yaml b/manager/manifests/argo.yaml new file mode 100644 index 0000000000..b4f6271583 --- /dev/null +++ b/manager/manifests/argo.yaml @@ -0,0 +1,132 @@ +# Copyright 2019 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: argo-executor + namespace: $CORTEX_NAMESPACE +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: argo-executor + namespace: $CORTEX_NAMESPACE +subjects: +- kind: ServiceAccount + name: argo-executor + namespace: $CORTEX_NAMESPACE +roleRef: + kind: ClusterRole + name: cluster-admin + apiGroup: rbac.authorization.k8s.io +--- + +apiVersion: apiextensions.k8s.io/v1beta1 +kind: CustomResourceDefinition +metadata: + name: workflows.argoproj.io + namespace: $CORTEX_NAMESPACE +spec: + group: argoproj.io + names: + kind: Workflow + plural: workflows + shortNames: + - wf + scope: Namespaced + version: v1alpha1 +--- + +apiVersion: v1 +kind: ServiceAccount +metadata: + name: argo-controller + namespace: $CORTEX_NAMESPACE +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: argo-controller + namespace: $CORTEX_NAMESPACE +rules: +- apiGroups: [""] + resources: [pods, pods/exec] + verbs: [create, get, list, watch, update, patch, delete] +- apiGroups: [""] + resources: [configmaps] + verbs: [get, watch, list] +- apiGroups: [""] + resources: [persistentvolumeclaims] + verbs: [create, delete] +- apiGroups: [argoproj.io] + resources: [workflows, workflows/finalizers] + verbs: [get, list, watch, update, patch, delete] +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: argo + namespace: $CORTEX_NAMESPACE +subjects: +- kind: ServiceAccount + name: argo-controller + namespace: $CORTEX_NAMESPACE +roleRef: + kind: Role + name: argo-controller + apiGroup: rbac.authorization.k8s.io +--- + +apiVersion: v1 +kind: ConfigMap +metadata: + name: argo-controller + namespace: $CORTEX_NAMESPACE +data: + config: | + namespace: $CORTEX_NAMESPACE +--- + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: argo-controller + namespace: $CORTEX_NAMESPACE +spec: + selector: + matchLabels: + app: argo-controller + template: + metadata: + labels: + 
app: argo-controller + spec: + containers: + - args: + - --configmap + - argo-controller + - --executor-image + - $CORTEX_IMAGE_ARGO_EXECUTOR + - --executor-image-pull-policy + - Always + command: + - workflow-controller + image: $CORTEX_IMAGE_ARGO_CONTROLLER + imagePullPolicy: Always + name: argo-controller + serviceAccountName: argo-controller diff --git a/manager/manifests/fluentd.yaml b/manager/manifests/fluentd.yaml new file mode 100644 index 0000000000..54cf07b958 --- /dev/null +++ b/manager/manifests/fluentd.yaml @@ -0,0 +1,138 @@ +# Copyright 2019 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: fluentd + namespace: $CORTEX_NAMESPACE + labels: + app: fluentd +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: fluentd + namespace: $CORTEX_NAMESPACE +rules: +- apiGroups: [""] + resources: [pods] + verbs: [get, list, watch] +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: fluentd + namespace: $CORTEX_NAMESPACE +subjects: +- kind: ServiceAccount + name: fluentd + namespace: $CORTEX_NAMESPACE +roleRef: + kind: Role + name: fluentd + apiGroup: rbac.authorization.k8s.io +--- + +apiVersion: v1 +kind: ConfigMap +metadata: + name: fluentd + namespace: $CORTEX_NAMESPACE +data: + fluent.conf: | + <match fluent.**> + @type null + </match> + <source> + @type tail + enable_stat_watcher false + path /var/log/containers/**_${CORTEX_NAMESPACE}_**.log + pos_file /var/log/fluentd-containers.log.pos + time_format %Y-%m-%dT%H:%M:%S.%NZ + tag * + format json + read_from_head true + </source> + <match **> + @type cloudwatch_logs + log_group_name "#{ENV['LOG_GROUP_NAME']}" + auto_create_stream true + use_tag_as_stream true + </match> +--- + +apiVersion: extensions/v1beta1 +kind: DaemonSet +metadata: + name: fluentd + namespace: $CORTEX_NAMESPACE +spec: + template: + metadata: + labels: + app: fluentd + spec: + serviceAccountName: fluentd + initContainers: + - name: copy-fluentd-config + image: busybox + command: ['sh', '-c', 'cp /config-volume/* /etc/fluentd'] + volumeMounts: + - name: config-volume + mountPath: /config-volume + - name: config + mountPath: /etc/fluentd + containers: + - name: fluentd + image: $CORTEX_IMAGE_FLUENTD + imagePullPolicy: Always + env: + - name: AWS_REGION + value: $CORTEX_REGION + - name: LOG_GROUP_NAME + value: $CORTEX_LOG_GROUP + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: aws-credentials + key: AWS_ACCESS_KEY_ID + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: aws-credentials + key: 
AWS_SECRET_ACCESS_KEY + volumeMounts: + - name: varlog + mountPath: 
/var/log + - name: varlibdockercontainers + mountPath: /var/lib/docker/containers + readOnly: true + - name: config + mountPath: /fluentd/etc + terminationGracePeriodSeconds: 30 + volumes: + - name: varlog + hostPath: + path: /var/log + - name: varlibdockercontainers + hostPath: + path: /var/lib/docker/containers + - name: config + emptyDir: {} + - name: config-volume + configMap: + name: fluentd diff --git a/manager/manifests/namespace.yaml b/manager/manifests/namespace.yaml new file mode 100644 index 0000000000..2bb3d82f47 --- /dev/null +++ b/manager/manifests/namespace.yaml @@ -0,0 +1,18 @@ +# Copyright 2019 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: Namespace +metadata: + name: $CORTEX_NAMESPACE diff --git a/manager/manifests/nginx.yaml b/manager/manifests/nginx.yaml new file mode 100644 index 0000000000..c554bc4652 --- /dev/null +++ b/manager/manifests/nginx.yaml @@ -0,0 +1,418 @@ +# Copyright 2019 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: ServiceAccount +metadata: + name: nginx + namespace: $CORTEX_NAMESPACE +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: nginx + namespace: $CORTEX_NAMESPACE +rules: + - apiGroups: [""] + resources: [endpoints, pods, secrets] + verbs: [list, watch] + - apiGroups: [""] + resources: [nodes, services, ingresses] + verbs: [get, list, watch] + - apiGroups: [""] + resources: [events] + verbs: [create, patch] + - apiGroups: ["extensions"] + resources: [ingresses] + verbs: [get, list, watch] + - apiGroups: ["extensions"] + resources: [ingresses/status] + verbs: [update] + - apiGroups: [""] + resources: [pods, secrets, namespaces, endpoints] + verbs: [get] + - apiGroups: [""] + resources: [configmaps] + resourceNames: + # Defaults to "<election-id>-<ingress-class>" + # Here: "<ingress-controller-leader>-<nginx>" + # This has to be adapted if you change either parameter + # when launching the nginx-ingress-controller.
+ - "ingress-controller-leader-operator" + - "ingress-controller-leader-apis" + verbs: [get, update] + - apiGroups: [""] + resources: [configmaps] + verbs: [get, list, watch, create] +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: nginx + namespace: $CORTEX_NAMESPACE +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: nginx +subjects: + - kind: ServiceAccount + name: nginx + namespace: $CORTEX_NAMESPACE +--- + +kind: ConfigMap +apiVersion: v1 +metadata: + name: nginx-configuration + namespace: $CORTEX_NAMESPACE +data: + use-proxy-protocol: "true" +--- + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: nginx-backend-operator + labels: + app.kubernetes.io/name: nginx-backend-operator + app.kubernetes.io/part-of: ingress-nginx + namespace: $CORTEX_NAMESPACE +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: nginx-backend-operator + app.kubernetes.io/part-of: ingress-nginx + template: + metadata: + labels: + app.kubernetes.io/name: nginx-backend-operator + app.kubernetes.io/part-of: ingress-nginx + spec: + terminationGracePeriodSeconds: 60 + containers: + - name: nginx-backend-operator + # Any image is permissible as long as: + # 1. It serves a 404 page at / + # 2. 
It serves 200 on a /healthz endpoint + image: $CORTEX_IMAGE_NGINX_BACKEND + imagePullPolicy: Always + livenessProbe: + httpGet: + path: /healthz + port: 8080 + scheme: HTTP + initialDelaySeconds: 30 + timeoutSeconds: 5 + ports: + - containerPort: 8080 + resources: + limits: + cpu: 10m + memory: 20Mi + requests: + cpu: 10m + memory: 20Mi +--- + +apiVersion: v1 +kind: Service +metadata: + name: nginx-backend-operator + namespace: $CORTEX_NAMESPACE + labels: + app.kubernetes.io/name: nginx-backend-operator + app.kubernetes.io/part-of: ingress-nginx +spec: + ports: + - port: 80 + targetPort: 8080 + selector: + app.kubernetes.io/name: nginx-backend-operator + app.kubernetes.io/part-of: ingress-nginx +--- + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: nginx-controller-operator + namespace: $CORTEX_NAMESPACE + labels: + app.kubernetes.io/name: nginx-controller-operator + app.kubernetes.io/part-of: ingress-nginx +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: nginx-controller-operator + app.kubernetes.io/part-of: ingress-nginx + template: + metadata: + labels: + app.kubernetes.io/name: nginx-controller-operator + app.kubernetes.io/part-of: ingress-nginx + spec: + serviceAccountName: nginx + containers: + - name: nginx-controller + image: $CORTEX_IMAGE_NGINX_CONTROLLER + imagePullPolicy: Always + args: + - /nginx-ingress-controller + - --watch-namespace=$CORTEX_NAMESPACE + - --default-backend-service=$CORTEX_NAMESPACE/nginx-backend-operator + - --configmap=$CORTEX_NAMESPACE/nginx-configuration + - --publish-service=$CORTEX_NAMESPACE/nginx-controller-operator + - --annotations-prefix=nginx.ingress.kubernetes.io + - --ingress-class=operator + securityContext: + capabilities: + drop: + - ALL + add: + - NET_BIND_SERVICE + # www-data -> 33 + runAsUser: 33 + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + ports: + - name: http + 
containerPort: 80 + - name: https + containerPort: 443 + livenessProbe: + failureThreshold: 3 + httpGet: + path: /healthz + port: 10254 + scheme: HTTP + initialDelaySeconds: 10 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 1 + readinessProbe: + failureThreshold: 3 + httpGet: + path: /healthz + port: 10254 + scheme: HTTP + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 1 +--- + +kind: Service +apiVersion: v1 +metadata: + name: nginx-controller-operator + namespace: $CORTEX_NAMESPACE + labels: + app.kubernetes.io/name: nginx-controller-operator + app.kubernetes.io/part-of: ingress-nginx + annotations: + # Enable PROXY protocol + service.beta.kubernetes.io/aws-load-balancer-proxy-protocol: '*' + # Ensure the ELB idle timeout is less than nginx keep-alive timeout. By default, + # NGINX keep-alive is set to 75s. If using WebSockets, the value will need to be + # increased to '3600' to avoid any potential issues. + service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: '60' +spec: + type: LoadBalancer + selector: + app.kubernetes.io/name: nginx-controller-operator + app.kubernetes.io/part-of: ingress-nginx + ports: + - name: http + port: 80 + targetPort: http + - name: https + port: 443 + targetPort: https +--- + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: nginx-backend-apis + labels: + app.kubernetes.io/name: nginx-backend-apis + app.kubernetes.io/part-of: ingress-nginx + namespace: $CORTEX_NAMESPACE +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: nginx-backend-apis + app.kubernetes.io/part-of: ingress-nginx + template: + metadata: + labels: + app.kubernetes.io/name: nginx-backend-apis + app.kubernetes.io/part-of: ingress-nginx + spec: + terminationGracePeriodSeconds: 60 + containers: + - name: nginx-backend-apis + # Any image is permissible as long as: + # 1. It serves a 404 page at / + # 2. 
It serves 200 on a /healthz endpoint + image: $CORTEX_IMAGE_NGINX_BACKEND + imagePullPolicy: Always + livenessProbe: + httpGet: + path: /healthz + port: 8080 + scheme: HTTP + initialDelaySeconds: 30 + timeoutSeconds: 5 + ports: + - containerPort: 8080 + resources: + limits: + cpu: 10m + memory: 20Mi + requests: + cpu: 10m + memory: 20Mi +--- + +apiVersion: v1 +kind: Service +metadata: + name: nginx-backend-apis + namespace: $CORTEX_NAMESPACE + labels: + app.kubernetes.io/name: nginx-backend-apis + app.kubernetes.io/part-of: ingress-nginx +spec: + ports: + - port: 80 + targetPort: 8080 + selector: + app.kubernetes.io/name: nginx-backend-apis + app.kubernetes.io/part-of: ingress-nginx +--- + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: nginx-controller-apis + namespace: $CORTEX_NAMESPACE + labels: + app.kubernetes.io/name: nginx-controller-apis + app.kubernetes.io/part-of: ingress-nginx +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: nginx-controller-apis + app.kubernetes.io/part-of: ingress-nginx + template: + metadata: + labels: + app.kubernetes.io/name: nginx-controller-apis + app.kubernetes.io/part-of: ingress-nginx + spec: + serviceAccountName: nginx + containers: + - name: nginx-controller + image: $CORTEX_IMAGE_NGINX_CONTROLLER + imagePullPolicy: Always + args: + - /nginx-ingress-controller + - --watch-namespace=$CORTEX_NAMESPACE + - --default-backend-service=$CORTEX_NAMESPACE/nginx-backend-apis + - --configmap=$CORTEX_NAMESPACE/nginx-configuration + - --publish-service=$CORTEX_NAMESPACE/nginx-controller-apis + - --annotations-prefix=nginx.ingress.kubernetes.io + - --ingress-class=apis + securityContext: + capabilities: + drop: + - ALL + add: + - NET_BIND_SERVICE + # www-data -> 33 + runAsUser: 33 + env: + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + ports: + - name: http + containerPort: 80 + - name: https + 
containerPort: 443 
+ livenessProbe: + failureThreshold: 3 + httpGet: + path: /healthz + port: 10254 + scheme: HTTP + initialDelaySeconds: 10 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 1 + readinessProbe: + failureThreshold: 3 + httpGet: + path: /healthz + port: 10254 + scheme: HTTP + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 1 +--- + +kind: Service +apiVersion: v1 +metadata: + name: nginx-controller-apis + namespace: $CORTEX_NAMESPACE + labels: + app.kubernetes.io/name: nginx-controller-apis + app.kubernetes.io/part-of: ingress-nginx + annotations: + # Enable PROXY protocol + service.beta.kubernetes.io/aws-load-balancer-proxy-protocol: '*' + # Ensure the ELB idle timeout is less than nginx keep-alive timeout. By default, + # NGINX keep-alive is set to 75s. If using WebSockets, the value will need to be + # increased to '3600' to avoid any potential issues. + service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: '60' +spec: + type: LoadBalancer + selector: + app.kubernetes.io/name: nginx-controller-apis + app.kubernetes.io/part-of: ingress-nginx + ports: + - name: http + port: 80 + targetPort: http + - name: https + port: 443 + targetPort: https diff --git a/manager/manifests/operator.yaml b/manager/manifests/operator.yaml new file mode 100644 index 0000000000..16669dd2b1 --- /dev/null +++ b/manager/manifests/operator.yaml @@ -0,0 +1,111 @@ +# Copyright 2019 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: operator + namespace: $CORTEX_NAMESPACE +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: operator + namespace: $CORTEX_NAMESPACE +subjects: +- kind: ServiceAccount + name: operator + namespace: $CORTEX_NAMESPACE +roleRef: + kind: ClusterRole + name: cluster-admin + apiGroup: rbac.authorization.k8s.io +--- + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: operator + namespace: $CORTEX_NAMESPACE + labels: + workloadType: operator +spec: + replicas: 1 + selector: + matchLabels: + workloadId: operator + template: + metadata: + labels: + workloadId: operator + workloadType: operator + spec: + containers: + - name: operator + image: $CORTEX_IMAGE_OPERATOR + imagePullPolicy: Always + env: + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: aws-credentials + key: AWS_ACCESS_KEY_ID + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: aws-credentials + key: AWS_SECRET_ACCESS_KEY + volumeMounts: + - name: cortex-config + mountPath: /configs/cortex + volumes: + - name: cortex-config + configMap: + name: cortex-config + serviceAccountName: operator +--- + +kind: Service +apiVersion: v1 +metadata: + name: operator + namespace: $CORTEX_NAMESPACE + labels: + workloadType: operator +spec: + selector: + workloadId: operator + ports: + - port: 8888 + targetPort: 8888 +--- + +apiVersion: extensions/v1beta1 +kind: Ingress +metadata: + name: operator + namespace: $CORTEX_NAMESPACE + labels: + workloadType: operator + annotations: + kubernetes.io/ingress.class: operator +spec: + rules: + - http: + paths: + - path: / + backend: + serviceName: operator + servicePort: 8888 diff --git a/manager/manifests/spark.yaml b/manager/manifests/spark.yaml new file mode 100644 index 0000000000..f15a393cf8 --- /dev/null +++ b/manager/manifests/spark.yaml @@ -0,0 +1,331 @@ +# Copyright 2019 Cortex Labs, Inc. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: ServiceAccount +metadata: + name: spark-operator + namespace: $CORTEX_NAMESPACE +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: spark-operator + namespace: $CORTEX_NAMESPACE +rules: +- apiGroups: [""] + resources: [pods] + verbs: ["*"] +- apiGroups: [""] + resources: [services, configmaps, secrets] + verbs: [create, get, delete] +- apiGroups: [extensions] + resources: [ingresses] + verbs: [create, get, delete] +- apiGroups: [""] + resources: [nodes] + verbs: [get] +- apiGroups: [""] + resources: [events] + verbs: [create, update, patch] +- apiGroups: [apiextensions.k8s.io] + resources: [customresourcedefinitions] + verbs: [create, get, update, delete] +- apiGroups: [admissionregistration.k8s.io] + resources: [mutatingwebhookconfigurations] + verbs: [create, get, update, delete] +- apiGroups: [sparkoperator.k8s.io] + resources: [sparkapplications, scheduledsparkapplications] + verbs: ["*"] +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: spark-operator + namespace: $CORTEX_NAMESPACE +subjects: + - kind: ServiceAccount + name: spark-operator + namespace: $CORTEX_NAMESPACE +roleRef: + kind: Role + name: spark-operator + apiGroup: rbac.authorization.k8s.io +--- + +apiVersion: apps/v1 +kind: Deployment +metadata: + name: spark-operator + namespace: $CORTEX_NAMESPACE + labels: + app.kubernetes.io/name: spark-operator + 
app.kubernetes.io/version: v2.4.0-v1alpha1 +spec: + replicas: 1 + selector: + matchLabels: + app.kubernetes.io/name: spark-operator + app.kubernetes.io/version: v2.4.0-v1alpha1 + strategy: + type: Recreate + template: + metadata: + labels: + app.kubernetes.io/name: spark-operator + app.kubernetes.io/version: v2.4.0-v1alpha1 + initializers: + pending: [] + spec: + serviceAccountName: spark-operator + containers: + - name: spark-operator + image: $CORTEX_IMAGE_SPARK_OPERATOR + imagePullPolicy: Always + command: ["/usr/bin/spark-operator"] + args: + - -namespace=$CORTEX_NAMESPACE + - -install-crds=false + - -logtostderr +--- + +apiVersion: apiextensions.k8s.io/v1beta1 +kind: CustomResourceDefinition +metadata: + name: sparkapplications.sparkoperator.k8s.io +spec: + group: sparkoperator.k8s.io + names: + kind: SparkApplication + listKind: SparkApplicationList + plural: sparkapplications + shortNames: + - sparkapp + singular: sparkapplication + scope: Namespaced + validation: + openAPIV3Schema: + properties: + spec: + properties: + deps: + properties: + downloadTimeout: + minimum: 1 + type: integer + maxSimultaneousDownloads: + minimum: 1 + type: integer + driver: + properties: + cores: + exclusiveMinimum: true + minimum: 0 + type: number + podName: + pattern: '[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*' + executor: + properties: + cores: + exclusiveMinimum: true + minimum: 0 + type: number + instances: + minimum: 1 + type: integer + mode: + enum: + - cluster + - client + monitoring: + properties: + prometheus: + properties: + port: + maximum: 49151 + minimum: 1024 + type: integer + pythonVersion: + enum: + - "2" + - "3" + restartPolicy: + properties: + onFailureRetries: + minimum: 0 + type: integer + onFailureRetryInterval: + minimum: 1 + type: integer + onSubmissionFailureRetries: + minimum: 0 + type: integer + onSubmissionFailureRetryInterval: + minimum: 1 + type: integer + type: + enum: + - Never + - OnFailure + - Always + type: + enum: + - Java 
+ - Scala + - Python + - R + version: v1alpha1 +--- + +apiVersion: apiextensions.k8s.io/v1beta1 +kind: CustomResourceDefinition +metadata: + name: scheduledsparkapplications.sparkoperator.k8s.io +spec: + group: sparkoperator.k8s.io + names: + kind: ScheduledSparkApplication + listKind: ScheduledSparkApplicationList + plural: scheduledsparkapplications + shortNames: + - scheduledsparkapp + singular: scheduledsparkapplication + scope: Namespaced + validation: + openAPIV3Schema: + properties: + spec: + properties: + concurrencyPolicy: + enum: + - Allow + - Forbid + - Replace + failedRunHistoryLimit: + minimum: 1 + type: integer + schedule: + type: string + successfulRunHistoryLimit: + minimum: 1 + type: integer + template: + properties: + deps: + properties: + downloadTimeout: + minimum: 1 + type: integer + maxSimultaneousDownloads: + minimum: 1 + type: integer + driver: + properties: + cores: + exclusiveMinimum: true + minimum: 0 + type: number + podName: + pattern: '[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*' + executor: + properties: + cores: + exclusiveMinimum: true + minimum: 0 + type: number + instances: + minimum: 1 + type: integer + mode: + enum: + - cluster + - client + monitoring: + properties: + prometheus: + properties: + port: + maximum: 49151 + minimum: 1024 + type: integer + pythonVersion: + enum: + - "2" + - "3" + restartPolicy: + properties: + onFailureRetries: + minimum: 0 + type: integer + onFailureRetryInterval: + minimum: 1 + type: integer + onSubmissionFailureRetries: + minimum: 0 + type: integer + onSubmissionFailureRetryInterval: + minimum: 1 + type: integer + type: + enum: + - Never + - OnFailure + - Always + type: + enum: + - Java + - Scala + - Python + - R + version: v1alpha1 +--- + +apiVersion: v1 +kind: ServiceAccount +metadata: + name: spark + namespace: $CORTEX_NAMESPACE +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: spark + namespace: $CORTEX_NAMESPACE +rules: +- apiGroups: + - "" # 
"" indicates the core API group + resources: [pods] + verbs: ["*"] +- apiGroups: + - "" # "" indicates the core API group + resources: [services] + verbs: ["*"] +--- + +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: spark + namespace: $CORTEX_NAMESPACE +subjects: +- kind: ServiceAccount + name: spark + namespace: $CORTEX_NAMESPACE +roleRef: + kind: Role + name: spark + apiGroup: rbac.authorization.k8s.io diff --git a/manager/uninstall.sh b/manager/uninstall.sh new file mode 100755 index 0000000000..b599dac713 --- /dev/null +++ b/manager/uninstall.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +# Copyright 2019 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +eksctl utils write-kubeconfig --name=$CORTEX_CLUSTER_NAME >/dev/null 2>&1 + +echo +echo "Uninstalling Cortex ... 
(this will take a few minutes)" + +# Remove finalizers on sparkapplications (they sometimes create deadlocks) +if kubectl get namespace $CORTEX_NAMESPACE >/dev/null 2>&1 && kubectl get customresourcedefinition sparkapplications.sparkoperator.k8s.io >/dev/null 2>&1; then + set +e + kubectl -n=$CORTEX_NAMESPACE get sparkapplications.sparkoperator.k8s.io -o name | xargs -L1 \ + kubectl -n=$CORTEX_NAMESPACE patch -p '{"metadata":{"finalizers": []}}' --type=merge >/dev/null 2>&1 + set -e +fi + +kubectl delete --ignore-not-found=true customresourcedefinition scheduledsparkapplications.sparkoperator.k8s.io >/dev/null 2>&1 +kubectl delete --ignore-not-found=true customresourcedefinition sparkapplications.sparkoperator.k8s.io >/dev/null 2>&1 +kubectl delete --ignore-not-found=true customresourcedefinition workflows.argoproj.io >/dev/null 2>&1 +kubectl delete --ignore-not-found=true namespace $CORTEX_NAMESPACE >/dev/null 2>&1 + +eksctl delete cluster --name=cortex + +echo "✓ Uninstalled Cortex" diff --git a/manager/update.sh b/manager/update.sh new file mode 100755 index 0000000000..f8ccfc4059 --- /dev/null +++ b/manager/update.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# Copyright 2019 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -e + +# Note: if namespace is changed, the old namespace will not be deleted +eksctl utils write-kubeconfig --name=$CORTEX_CLUSTER_NAME >/dev/null 2>&1 + +kubectl -n=$CORTEX_NAMESPACE delete --ignore-not-found=true deployment operator +kubectl -n=$CORTEX_NAMESPACE delete --ignore-not-found=true daemonset fluentd # Pods in DaemonSets cannot be modified diff --git a/pkg/operator/endpoints/errors.go b/pkg/operator/endpoints/errors.go index 938df26c52..fa93f656c1 100644 --- a/pkg/operator/endpoints/errors.go +++ b/pkg/operator/endpoints/errors.go @@ -118,7 +118,7 @@ func ErrorAuthHeaderMalformed() error { func ErrorAuthAPIError() error { return Error{ Kind: ErrAuthAPIError, - message: "the operator is unable to verify user's credentials using AWS STS; export AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY, and run `./cortex-installer.sh update operator` to update the operator's AWS credentials", + message: "the operator is unable to verify user's credentials using AWS STS; export AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY, and run `./cortex.sh update operator` to update the operator's AWS credentials", } } diff --git a/pkg/operator/workloads/errors.go b/pkg/operator/workloads/errors.go index 1a952ad377..d507d2e566 100644 --- a/pkg/operator/workloads/errors.go +++ b/pkg/operator/workloads/errors.go @@ -107,7 +107,7 @@ func ErrorWorkflowAppMismatch() error { func ErrorCortexInstallationBroken() error { return Error{ Kind: ErrCortexInstallationBroken, - message: "cortex is out of date, or not installed properly on your cluster; run `./cortex-installer.sh uninstall operator && ./cortex-installer.sh install operator`", + message: "cortex is out of date, or not installed properly on your cluster; run `./cortex.sh uninstall operator && ./cortex-installer.sh install operator`", } } From 4778b974544da16b18c81dbdddd2bb1fc962765b Mon Sep 17 00:00:00 2001 From: David Eliahu Date: Mon, 1 Jul 2019 14:56:54 -0700 Subject: [PATCH 2/6] Misc --- Makefile | 2 + cortex.sh | 66 
+++++++++++-------- dev/registry.sh | 2 + docs/cluster/config.md | 4 +- manager/info.sh | 2 +- manager/{aws.sh => install_aws.sh} | 2 +- manager/{install.sh => install_cortex.sh} | 2 +- manager/manifests/fluentd.yaml | 4 +- manager/uninstall_aws.sh | 21 ++++++ manager/{uninstall.sh => uninstall_cortex.sh} | 9 +-- manager/{update.sh => uninstall_operator.sh} | 2 +- pkg/operator/workloads/errors.go | 2 +- 12 files changed, 74 insertions(+), 44 deletions(-) rename manager/{aws.sh => install_aws.sh} (92%) rename manager/{install.sh => install_cortex.sh} (98%) create mode 100644 manager/uninstall_aws.sh rename manager/{uninstall.sh => uninstall_cortex.sh} (87%) rename manager/{update.sh => uninstall_operator.sh} (92%) diff --git a/Makefile b/Makefile index 91e3ce5b8d..6e284b333f 100644 --- a/Makefile +++ b/Makefile @@ -145,6 +145,7 @@ ci-build-images: @./build/build-image.sh images/argo-controller argo-controller @./build/build-image.sh images/argo-executor argo-executor @./build/build-image.sh images/python-packager python-packager + @./build/build-image.sh images/manager manager ci-push-images: @./build/push-image.sh spark @@ -161,6 +162,7 @@ ci-push-images: @./build/push-image.sh argo-controller @./build/push-image.sh argo-executor @./build/push-image.sh python-packager + @./build/push-image.sh manager ci-build-cli: @./build/cli.sh diff --git a/cortex.sh b/cortex.sh index cdc5c079ec..661b2ca120 100755 --- a/cortex.sh +++ b/cortex.sh @@ -80,7 +80,8 @@ done if [ "$CORTEX_CONFIG" != "" ]; then if [ ! 
-f "$CORTEX_CONFIG" ]; then - echo "cortex config file does not exist: $CORTEX_CONFIG" + echo "Cortex config file does not exist: $CORTEX_CONFIG" + exit 1 fi source $CORTEX_CONFIG fi @@ -99,12 +100,11 @@ export CORTEX_LOG_GROUP="${CORTEX_LOG_GROUP:-cortex}" export CORTEX_BUCKET="${CORTEX_BUCKET:-cortex-$random_id}" export CORTEX_REGION="${CORTEX_REGION:-us-west-2}" -export CORTEX_CLUSTER_NAME="${CORTEX_CLUSTER_NAME:-cortex}" +export CORTEX_CLUSTER="${CORTEX_CLUSTER:-cortex}" +export CORTEX_NAMESPACE="${CORTEX_NAMESPACE:-cortex}" export CORTEX_NODE_TYPE="${CORTEX_NODE_TYPE:-t3.medium}" export CORTEX_NODES_MIN="${CORTEX_NODES_MIN:-1}" -export CORTEX_NODES_MAX="${CORTEX_NODES_MAX:-3}" - -export CORTEX_NAMESPACE="${CORTEX_NAMESPACE:-cortex}" +export CORTEX_NODES_MAX="${CORTEX_NODES_MAX:-5}" export CORTEX_IMAGE_ARGO_CONTROLLER="${CORTEX_IMAGE_ARGO_CONTROLLER:-cortexlabs/argo-controller:$CORTEX_VERSION_STABLE}" export CORTEX_IMAGE_ARGO_EXECUTOR="${CORTEX_IMAGE_ARGO_EXECUTOR:-cortexlabs/argo-executor:$CORTEX_VERSION_STABLE}" @@ -127,11 +127,11 @@ export CORTEX_ENABLE_TELEMETRY="${CORTEX_ENABLE_TELEMETRY:-""}" ### TOP-LEVEL COMMANDS ### ########################## -function aws() { - docker run --entrypoint /root/aws.sh \ +function install_aws() { + docker run --entrypoint /root/install_aws.sh \ -e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \ -e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \ - -e CORTEX_CLUSTER_NAME=$CORTEX_CLUSTER_NAME \ + -e CORTEX_CLUSTER=$CORTEX_CLUSTER \ -e CORTEX_NODE_TYPE=$CORTEX_NODE_TYPE \ -e CORTEX_NODES_MIN=$CORTEX_NODES_MIN \ -e CORTEX_NODES_MAX=$CORTEX_NODES_MAX \ @@ -141,11 +141,19 @@ function aws() { cortexlabs/manager } -function install() { - docker run --entrypoint /root/install.sh \ +function uninstall_aws() { + docker run --entrypoint /root/uninstall_aws.sh \ -e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \ -e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \ - -e CORTEX_CLUSTER_NAME=$CORTEX_CLUSTER_NAME \ + -e CORTEX_CLUSTER=$CORTEX_CLUSTER \ + 
cortexlabs/manager +} + +function install_cortex() { + docker run --entrypoint /root/install_operator.sh \ + -e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \ + -e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \ + -e CORTEX_CLUSTER=$CORTEX_CLUSTER \ -e CORTEX_NAMESPACE=$CORTEX_NAMESPACE \ -e CORTEX_LOG_GROUP=$CORTEX_LOG_GROUP \ -e CORTEX_BUCKET=$CORTEX_BUCKET \ @@ -168,29 +176,29 @@ function install() { cortexlabs/manager } -function uninstall() { - docker run --entrypoint /root/uninstall.sh \ +function uninstall_cortex() { + docker run --entrypoint /root/uninstall_cortex.sh \ -e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \ -e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \ - -e CORTEX_CLUSTER_NAME=$CORTEX_CLUSTER_NAME \ + -e CORTEX_CLUSTER=$CORTEX_CLUSTER \ -e CORTEX_NAMESPACE=$CORTEX_NAMESPACE \ cortexlabs/manager } -function info() { - docker run --entrypoint /root/info.sh \ +function uninstall_operator() { + docker run --entrypoint /root/uninstall_operator.sh \ -e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \ -e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \ - -e CORTEX_CLUSTER_NAME=$CORTEX_CLUSTER_NAME \ + -e CORTEX_CLUSTER=$CORTEX_CLUSTER \ -e CORTEX_NAMESPACE=$CORTEX_NAMESPACE \ cortexlabs/manager } -function update() { - docker run --entrypoint /root/update.sh \ +function info() { + docker run --entrypoint /root/info.sh \ -e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \ -e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \ - -e CORTEX_CLUSTER_NAME=$CORTEX_CLUSTER_NAME \ + -e CORTEX_CLUSTER=$CORTEX_CLUSTER \ -e CORTEX_NAMESPACE=$CORTEX_NAMESPACE \ cortexlabs/manager } @@ -364,13 +372,13 @@ Usage: ./cortex.sh command [sub-command] [flags] Available Commands: - install install Cortex - uninstall uninstall Cortex - update update Cortex - info information about Cortex + install install Cortex + uninstall uninstall Cortex + update update Cortex + info information about Cortex - install cli install the Cortex CLI - uninstall cli uninstall the Cortex CLI + install cli install the Cortex CLI + 
uninstall cli uninstall the Cortex CLI Flags: -c, --config path to a Cortex config file @@ -397,7 +405,7 @@ if [ "$arg1" = "install" ]; then show_help exit 1 elif [ "$arg2" = "" ]; then - aws && install && info + prompt_for_telemetry && install_aws && install_cortex && info elif [ "$arg2" = "cli" ]; then install_cli elif [ "$arg2" = "" ]; then @@ -415,7 +423,7 @@ elif [ "$arg1" = "uninstall" ]; then show_help exit 1 elif [ "$arg2" = "" ]; then - uninstall + uninstall_cortex && uninstall_aws elif [ "$arg2" = "cli" ]; then uninstall_cli elif [ "$arg2" = "" ]; then @@ -433,7 +441,7 @@ elif [ "$arg1" = "update" ]; then show_help exit 1 else - update && install + uninstall_operator && install_cortex fi elif [ "$arg1" = "info" ]; then if [ ! "$arg2" = "" ]; then diff --git a/dev/registry.sh b/dev/registry.sh index 468ee09d67..e0cf1fc67d 100755 --- a/dev/registry.sh +++ b/dev/registry.sh @@ -49,6 +49,7 @@ function create_registry() { aws ecr create-repository --repository-name=cortexlabs/python-packager --region=$REGISTRY_REGION || true aws ecr create-repository --repository-name=cortexlabs/tf-train-gpu --region=$REGISTRY_REGION || true aws ecr create-repository --repository-name=cortexlabs/tf-serve-gpu --region=$REGISTRY_REGION || true + aws ecr create-repository --repository-name=cortexlabs/manager --region=$REGISTRY_REGION || true } ### HELPERS ### @@ -133,6 +134,7 @@ elif [ "$cmd" = "update" ]; then build_and_push $ROOT/images/tf-serve tf-serve latest build_and_push $ROOT/images/tf-serve-gpu tf-serve-gpu latest build_and_push $ROOT/images/python-packager python-packager latest + build_and_push $ROOT/images/manager manager latest fi build_and_push $ROOT/images/spark spark latest diff --git a/docs/cluster/config.md b/docs/cluster/config.md index 7154cc91fa..73cd0756b3 100644 --- a/docs/cluster/config.md +++ b/docs/cluster/config.md @@ -21,7 +21,7 @@ export CORTEX_BUCKET="cortex-[RANDOM_ID]" export CORTEX_REGION="us-west-2" # The name of the EKS cluster Cortex will use 
-export CORTEX_CLUSTER_NAME="cortex" +export CORTEX_CLUSTER="cortex" # The AWS node type Cortex will use export CORTEX_NODE_TYPE="t3.medium" @@ -30,7 +30,7 @@ export CORTEX_NODE_TYPE="t3.medium" export CORTEX_NODES_MIN=1 # Maximum number of nodes in the cluster -export CORTEX_NODES_MAX=3 +export CORTEX_NODES_MAX=5 # The name of the Kubernetes namespace Cortex will use export CORTEX_NAMESPACE="cortex" diff --git a/manager/info.sh b/manager/info.sh index 05af77e4c1..b9ef9f933f 100755 --- a/manager/info.sh +++ b/manager/info.sh @@ -26,7 +26,7 @@ function get_apis_endpoint() { kubectl -n=$CORTEX_NAMESPACE get service nginx-controller-apis -o json | tr -d '[:space:]' | sed 's/.*{\"hostname\":\"\(.*\)\".*/\1/' } -eksctl utils write-kubeconfig --name=$CORTEX_CLUSTER_NAME >/dev/null 2>&1 +eksctl utils write-kubeconfig --name=$CORTEX_CLUSTER >/dev/null 2>&1 operator_endpoint=$(get_operator_endpoint) apis_endpoint=$(get_apis_endpoint) diff --git a/manager/aws.sh b/manager/install_aws.sh similarity index 92% rename from manager/aws.sh rename to manager/install_aws.sh index 6206a1da5e..1d1f7f3f24 100755 --- a/manager/aws.sh +++ b/manager/install_aws.sh @@ -44,7 +44,7 @@ function setup_cloudwatch_logs() { echo "Installing Cortex ... 
(this will about 20 minutes)" -eksctl create cluster --name=cortex --asg-access --node-type=$CORTEX_NODE_TYPE --nodes-min=$CORTEX_NODES_MIN --nodes-max=$CORTEX_NODES_MAX +eksctl create cluster --name=$CORTEX_CLUSTER --asg-access --node-type=$CORTEX_NODE_TYPE --nodes-min=$CORTEX_NODES_MIN --nodes-max=$CORTEX_NODES_MAX setup_bucket setup_cloudwatch_logs diff --git a/manager/install.sh b/manager/install_cortex.sh similarity index 98% rename from manager/install.sh rename to manager/install_cortex.sh index 413492c33f..05742f4bb7 100755 --- a/manager/install.sh +++ b/manager/install_cortex.sh @@ -120,7 +120,7 @@ function validate_cortex() { fi } -eksctl utils write-kubeconfig --name=$CORTEX_CLUSTER_NAME >/dev/null 2>&1 +eksctl utils write-kubeconfig --name=$CORTEX_CLUSTER >/dev/null 2>&1 envsubst < manifests/namespace.yaml | kubectl apply -f - >/dev/null diff --git a/manager/manifests/fluentd.yaml b/manager/manifests/fluentd.yaml index 54cf07b958..9049e66b83 100644 --- a/manager/manifests/fluentd.yaml +++ b/manager/manifests/fluentd.yaml @@ -27,7 +27,7 @@ metadata: name: fluentd namespace: $CORTEX_NAMESPACE rules: -- apiGroups: [\"\"] +- apiGroups: [""] resources: [pods] verbs: [get, list, watch] --- @@ -69,7 +69,7 @@ data: @type cloudwatch_logs - log_group_name \"#{ENV['LOG_GROUP_NAME']}\" + log_group_name "#{ENV['LOG_GROUP_NAME']}" auto_create_stream true use_tag_as_stream true diff --git a/manager/uninstall_aws.sh b/manager/uninstall_aws.sh new file mode 100644 index 0000000000..7256e6f1d5 --- /dev/null +++ b/manager/uninstall_aws.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Copyright 2019 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +echo -e "\nSpinning down cluster ... (this will take a few minutes)" + +eksctl delete cluster --name=$CORTEX_CLUSTER + +echo -e "\nSpun down cluster" diff --git a/manager/uninstall.sh b/manager/uninstall_cortex.sh similarity index 87% rename from manager/uninstall.sh rename to manager/uninstall_cortex.sh index b599dac713..901b0454de 100755 --- a/manager/uninstall.sh +++ b/manager/uninstall_cortex.sh @@ -16,10 +16,9 @@ set -e -eksctl utils write-kubeconfig --name=$CORTEX_CLUSTER_NAME >/dev/null 2>&1 +eksctl utils write-kubeconfig --name=$CORTEX_CLUSTER >/dev/null 2>&1 -echo -echo "Uninstalling Cortex ... (this will take a few minutes)" +echo -e "\nUninstalling Cortex ..." 
# Remove finalizers on sparkapplications (they sometimes create deadlocks) if kubectl get namespace $CORTEX_NAMESPACE >/dev/null 2>&1 && kubectl get customresourcedefinition sparkapplications.sparkoperator.k8s.io >/dev/null 2>&1; then @@ -34,6 +33,4 @@ kubectl delete --ignore-not-found=true customresourcedefinition sparkapplication kubectl delete --ignore-not-found=true customresourcedefinition workflows.argoproj.io >/dev/null 2>&1 kubectl delete --ignore-not-found=true namespace $CORTEX_NAMESPACE >/dev/null 2>&1 -eksctl delete cluster --name=cortex - -echo "✓ Uninstalled Cortex" +echo -e "\nUninstalled Cortex" diff --git a/manager/update.sh b/manager/uninstall_operator.sh similarity index 92% rename from manager/update.sh rename to manager/uninstall_operator.sh index f8ccfc4059..0e2585d739 100755 --- a/manager/update.sh +++ b/manager/uninstall_operator.sh @@ -17,7 +17,7 @@ set -e # Note: if namespace is changed, the old namespace will not be deleted -eksctl utils write-kubeconfig --name=$CORTEX_CLUSTER_NAME >/dev/null 2>&1 +eksctl utils write-kubeconfig --name=$CORTEX_CLUSTER >/dev/null 2>&1 kubectl -n=$CORTEX_NAMESPACE delete --ignore-not-found=true deployment operator kubectl -n=$CORTEX_NAMESPACE delete --ignore-not-found=true daemonset fluentd # Pods in DaemonSets cannot be modified diff --git a/pkg/operator/workloads/errors.go b/pkg/operator/workloads/errors.go index d507d2e566..612d898f30 100644 --- a/pkg/operator/workloads/errors.go +++ b/pkg/operator/workloads/errors.go @@ -107,7 +107,7 @@ func ErrorWorkflowAppMismatch() error { func ErrorCortexInstallationBroken() error { return Error{ Kind: ErrCortexInstallationBroken, - message: "cortex is out of date, or not installed properly on your cluster; run `./cortex.sh uninstall operator && ./cortex-installer.sh install operator`", + message: "cortex is out of date, or not installed properly on your cluster; run `./cortex.sh update`", } } From 08cd742614a5740482261899ee02cff6573a40d2 Mon Sep 17 00:00:00 2001 
From: David Eliahu Date: Mon, 1 Jul 2019 15:09:46 -0700 Subject: [PATCH 3/6] Update Dockerfile --- images/manager/Dockerfile | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/images/manager/Dockerfile b/images/manager/Dockerfile index 8e8df5b40e..451e214527 100644 --- a/images/manager/Dockerfile +++ b/images/manager/Dockerfile @@ -2,21 +2,22 @@ FROM python:3.7-alpine3.10 WORKDIR /root +ENV PATH /root/.local/bin:$PATH + RUN pip3 install awscli --upgrade --user -RUN mv .local/bin/aws /usr/local/bin/ RUN apk add --no-cache bash curl gettext -RUN curl --location "https://github.com/weaveworks/eksctl/releases/download/0.1.38/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp -RUN mv /tmp/eksctl /usr/local/bin +RUN curl --location "https://github.com/weaveworks/eksctl/releases/download/0.1.38/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp && \ + mv /tmp/eksctl /usr/local/bin -RUN curl -o aws-iam-authenticator https://amazon-eks.s3-us-west-2.amazonaws.com/1.13.7/2019-06-11/bin/linux/amd64/aws-iam-authenticator -RUN chmod +x ./aws-iam-authenticator -RUN mv ./aws-iam-authenticator /usr/local/bin/aws-iam-authenticator +RUN curl -o aws-iam-authenticator https://amazon-eks.s3-us-west-2.amazonaws.com/1.13.7/2019-06-11/bin/linux/amd64/aws-iam-authenticator && \ + chmod +x ./aws-iam-authenticator && \ + mv ./aws-iam-authenticator /usr/local/bin/aws-iam-authenticator -RUN curl -LO https://storage.googleapis.com/kubernetes-release/release/v1.15.0/bin/linux/amd64/kubectl -RUN chmod +x ./kubectl -RUN mv ./kubectl /usr/local/bin/kubectl +RUN curl -LO https://storage.googleapis.com/kubernetes-release/release/v1.15.0/bin/linux/amd64/kubectl && \ + chmod +x ./kubectl && \ + mv ./kubectl /usr/local/bin/kubectl COPY manager /root From 20fd514446bf68d00f1c9734cda725648032c42a Mon Sep 17 00:00:00 2001 From: David Eliahu Date: Mon, 1 Jul 2019 15:29:12 -0700 Subject: [PATCH 4/6] Delete pip cache --- images/manager/Dockerfile | 3 ++- 1 file changed, 2 
insertions(+), 1 deletion(-) diff --git a/images/manager/Dockerfile b/images/manager/Dockerfile index 451e214527..37733fca5c 100644 --- a/images/manager/Dockerfile +++ b/images/manager/Dockerfile @@ -4,7 +4,8 @@ WORKDIR /root ENV PATH /root/.local/bin:$PATH -RUN pip3 install awscli --upgrade --user +RUN pip3 install awscli --upgrade --user && \ + rm -rf /root/.cache/pip* RUN apk add --no-cache bash curl gettext From 73849f3c9004d65fbcbcf16cab97c655c6ee999d Mon Sep 17 00:00:00 2001 From: Omer Spillinger Date: Mon, 1 Jul 2019 16:19:48 -0700 Subject: [PATCH 5/6] Clean up output --- cortex.sh | 4 ++-- docs/cluster/config.md | 2 +- manager/info.sh | 3 ++- manager/install_aws.sh | 41 ++++++++++++++++++++++++++++++----- manager/install_cortex.sh | 39 +++++---------------------------- manager/uninstall_aws.sh | 5 +++-- manager/uninstall_cortex.sh | 5 +++-- manager/uninstall_operator.sh | 12 +++++++--- 8 files changed, 62 insertions(+), 49 deletions(-) mode change 100644 => 100755 manager/uninstall_aws.sh diff --git a/cortex.sh b/cortex.sh index 661b2ca120..d5b217ef23 100755 --- a/cortex.sh +++ b/cortex.sh @@ -103,7 +103,7 @@ export CORTEX_REGION="${CORTEX_REGION:-us-west-2}" export CORTEX_CLUSTER="${CORTEX_CLUSTER:-cortex}" export CORTEX_NAMESPACE="${CORTEX_NAMESPACE:-cortex}" export CORTEX_NODE_TYPE="${CORTEX_NODE_TYPE:-t3.medium}" -export CORTEX_NODES_MIN="${CORTEX_NODES_MIN:-1}" +export CORTEX_NODES_MIN="${CORTEX_NODES_MIN:-2}" export CORTEX_NODES_MAX="${CORTEX_NODES_MAX:-5}" export CORTEX_IMAGE_ARGO_CONTROLLER="${CORTEX_IMAGE_ARGO_CONTROLLER:-cortexlabs/argo-controller:$CORTEX_VERSION_STABLE}" @@ -150,7 +150,7 @@ function uninstall_aws() { } function install_cortex() { - docker run --entrypoint /root/install_operator.sh \ + docker run --entrypoint /root/install_cortex.sh \ -e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \ -e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \ -e CORTEX_CLUSTER=$CORTEX_CLUSTER \ diff --git a/docs/cluster/config.md b/docs/cluster/config.md index 
73cd0756b3..feee822ac3 100644 --- a/docs/cluster/config.md +++ b/docs/cluster/config.md @@ -27,7 +27,7 @@ export CORTEX_CLUSTER="cortex" export CORTEX_NODE_TYPE="t3.medium" # Minimum number of nodes in the cluster -export CORTEX_NODES_MIN=1 +export CORTEX_NODES_MIN=2 # Maximum number of nodes in the cluster export CORTEX_NODES_MAX=5 diff --git a/manager/info.sh b/manager/info.sh index b9ef9f933f..8cc88ae420 100755 --- a/manager/info.sh +++ b/manager/info.sh @@ -26,7 +26,8 @@ function get_apis_endpoint() { kubectl -n=$CORTEX_NAMESPACE get service nginx-controller-apis -o json | tr -d '[:space:]' | sed 's/.*{\"hostname\":\"\(.*\)\".*/\1/' } -eksctl utils write-kubeconfig --name=$CORTEX_CLUSTER >/dev/null 2>&1 +echo +eksctl utils write-kubeconfig --name=$CORTEX_CLUSTER operator_endpoint=$(get_operator_endpoint) apis_endpoint=$(get_apis_endpoint) diff --git a/manager/install_aws.sh b/manager/install_aws.sh index 1d1f7f3f24..d18230012e 100755 --- a/manager/install_aws.sh +++ b/manager/install_aws.sh @@ -19,7 +19,7 @@ set -e function setup_bucket() { if ! aws s3api head-bucket --bucket $CORTEX_BUCKET --output json 2>/dev/null; then if aws s3 ls "s3://$CORTEX_BUCKET" --output json 2>&1 | grep -q 'NoSuchBucket'; then - echo -e "\nCreating S3 bucket: $CORTEX_BUCKET" + echo -e "\n✓ Creating S3 bucket: $CORTEX_BUCKET" aws s3api create-bucket --bucket $CORTEX_BUCKET \ --region $CORTEX_REGION \ --create-bucket-configuration LocationConstraint=$CORTEX_REGION \ @@ -29,22 +29,53 @@ function setup_bucket() { exit 1 fi else - echo -e "\nUsing existing S3 bucket: $CORTEX_BUCKET" + echo -e "\n✓ Using existing S3 bucket: $CORTEX_BUCKET" fi } function setup_cloudwatch_logs() { if ! 
aws logs list-tags-log-group --log-group-name $CORTEX_LOG_GROUP --region $CORTEX_REGION --output json 2>&1 | grep -q "\"tags\":"; then - echo -e "\nCreating CloudWatch log group: $CORTEX_LOG_GROUP" + echo -e "\n✓ Creating CloudWatch log group: $CORTEX_LOG_GROUP" aws logs create-log-group --log-group-name $CORTEX_LOG_GROUP --region $CORTEX_REGION else - echo -e "\nUsing existing CloudWatch log group: $CORTEX_LOG_GROUP" + echo -e "\n✓ Using existing CloudWatch log group: $CORTEX_LOG_GROUP" fi } -echo "Installing Cortex ... (this will about 20 minutes)" +function setup_configmap() { + kubectl -n=$CORTEX_NAMESPACE create configmap 'cortex-config' \ + --from-literal='LOG_GROUP'=$CORTEX_LOG_GROUP \ + --from-literal='BUCKET'=$CORTEX_BUCKET \ + --from-literal='REGION'=$CORTEX_REGION \ + --from-literal='NAMESPACE'=$CORTEX_NAMESPACE \ + --from-literal='IMAGE_OPERATOR'=$CORTEX_IMAGE_OPERATOR \ + --from-literal='IMAGE_SPARK'=$CORTEX_IMAGE_SPARK \ + --from-literal='IMAGE_TF_TRAIN'=$CORTEX_IMAGE_TF_TRAIN \ + --from-literal='IMAGE_TF_SERVE'=$CORTEX_IMAGE_TF_SERVE \ + --from-literal='IMAGE_TF_API'=$CORTEX_IMAGE_TF_API \ + --from-literal='IMAGE_PYTHON_PACKAGER'=$CORTEX_IMAGE_PYTHON_PACKAGER \ + --from-literal='IMAGE_TF_TRAIN_GPU'=$CORTEX_IMAGE_TF_TRAIN_GPU \ + --from-literal='IMAGE_TF_SERVE_GPU'=$CORTEX_IMAGE_TF_SERVE_GPU \ + --from-literal='ENABLE_TELEMETRY'=$CORTEX_ENABLE_TELEMETRY \ + -o yaml --dry-run | kubectl apply -f - >/dev/null +} + +function setup_secrets() { + kubectl -n=$CORTEX_NAMESPACE create secret generic 'aws-credentials' \ + --from-literal='AWS_ACCESS_KEY_ID'=$AWS_ACCESS_KEY_ID \ + --from-literal='AWS_SECRET_ACCESS_KEY'=$AWS_SECRET_ACCESS_KEY \ + -o yaml --dry-run | kubectl apply -f - >/dev/null +} +echo -e "\nSpinning up a cluster ... 
(this will about 15 minutes)" + +echo eksctl create cluster --name=$CORTEX_CLUSTER --asg-access --node-type=$CORTEX_NODE_TYPE --nodes-min=$CORTEX_NODES_MIN --nodes-max=$CORTEX_NODES_MAX +echo -e "\n✓ Spun up a cluster" + setup_bucket setup_cloudwatch_logs + +setup_configmap +setup_secrets diff --git a/manager/install_cortex.sh b/manager/install_cortex.sh index 05742f4bb7..b030c2acfc 100755 --- a/manager/install_cortex.sh +++ b/manager/install_cortex.sh @@ -16,35 +16,10 @@ set -e -function setup_configmap() { - kubectl -n=$CORTEX_NAMESPACE create configmap 'cortex-config' \ - --from-literal='LOG_GROUP'=$CORTEX_LOG_GROUP \ - --from-literal='BUCKET'=$CORTEX_BUCKET \ - --from-literal='REGION'=$CORTEX_REGION \ - --from-literal='NAMESPACE'=$CORTEX_NAMESPACE \ - --from-literal='IMAGE_OPERATOR'=$CORTEX_IMAGE_OPERATOR \ - --from-literal='IMAGE_SPARK'=$CORTEX_IMAGE_SPARK \ - --from-literal='IMAGE_TF_TRAIN'=$CORTEX_IMAGE_TF_TRAIN \ - --from-literal='IMAGE_TF_SERVE'=$CORTEX_IMAGE_TF_SERVE \ - --from-literal='IMAGE_TF_API'=$CORTEX_IMAGE_TF_API \ - --from-literal='IMAGE_PYTHON_PACKAGER'=$CORTEX_IMAGE_PYTHON_PACKAGER \ - --from-literal='IMAGE_TF_TRAIN_GPU'=$CORTEX_IMAGE_TF_TRAIN_GPU \ - --from-literal='IMAGE_TF_SERVE_GPU'=$CORTEX_IMAGE_TF_SERVE_GPU \ - --from-literal='ENABLE_TELEMETRY'=$CORTEX_ENABLE_TELEMETRY \ - -o yaml --dry-run | kubectl apply -f - >/dev/null -} - -function setup_secrets() { - kubectl -n=$CORTEX_NAMESPACE create secret generic 'aws-credentials' \ - --from-literal='AWS_ACCESS_KEY_ID'=$AWS_ACCESS_KEY_ID \ - --from-literal='AWS_SECRET_ACCESS_KEY'=$AWS_SECRET_ACCESS_KEY \ - -o yaml --dry-run | kubectl apply -f - >/dev/null -} - function validate_cortex() { set +e - echo -en "\nWaiting for the Cortex operator to be ready " + echo -en "\nWaiting for Cortex to be ready " operator_load_balancer="waiting" api_load_balancer="waiting" @@ -109,24 +84,22 @@ function validate_cortex() { continue fi - echo " ✓" break done - echo -e "\nCortex is ready!" 
+ echo -e "\n\n✓ Cortex is ready!" if command -v cortex >/dev/null; then echo -e "\nPlease run \`cortex configure\` to make sure your CLI is configured correctly" fi } -eksctl utils write-kubeconfig --name=$CORTEX_CLUSTER >/dev/null 2>&1 +echo +eksctl utils write-kubeconfig --name=$CORTEX_CLUSTER -envsubst < manifests/namespace.yaml | kubectl apply -f - >/dev/null - -setup_configmap -setup_secrets +echo -e "\nInstalling Cortex ..." +envsubst < manifests/namespace.yaml | kubectl apply -f - >/dev/null envsubst < manifests/spark.yaml | kubectl apply -f - >/dev/null envsubst < manifests/argo.yaml | kubectl apply -f - >/dev/null envsubst < manifests/nginx.yaml | kubectl apply -f - >/dev/null diff --git a/manager/uninstall_aws.sh b/manager/uninstall_aws.sh old mode 100644 new mode 100755 index 7256e6f1d5..5ab52d13c4 --- a/manager/uninstall_aws.sh +++ b/manager/uninstall_aws.sh @@ -14,8 +14,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -echo -e "\nSpinning down cluster ... (this will take a few minutes)" +echo -e "\nSpinning down the cluster ... (this will take a few minutes)" +echo eksctl delete cluster --name=$CORTEX_CLUSTER -echo -e "\nSpun down cluster" +echo -e "\n✓ Spun down the cluster" diff --git a/manager/uninstall_cortex.sh b/manager/uninstall_cortex.sh index 901b0454de..692c7ceaea 100755 --- a/manager/uninstall_cortex.sh +++ b/manager/uninstall_cortex.sh @@ -16,7 +16,8 @@ set -e -eksctl utils write-kubeconfig --name=$CORTEX_CLUSTER >/dev/null 2>&1 +echo +eksctl utils write-kubeconfig --name=$CORTEX_CLUSTER echo -e "\nUninstalling Cortex ..." 
@@ -33,4 +34,4 @@ kubectl delete --ignore-not-found=true customresourcedefinition sparkapplication kubectl delete --ignore-not-found=true customresourcedefinition workflows.argoproj.io >/dev/null 2>&1 kubectl delete --ignore-not-found=true namespace $CORTEX_NAMESPACE >/dev/null 2>&1 -echo -e "\nUninstalled Cortex" +echo "✓ Uninstalled Cortex" diff --git a/manager/uninstall_operator.sh b/manager/uninstall_operator.sh index 0e2585d739..f7719039dd 100755 --- a/manager/uninstall_operator.sh +++ b/manager/uninstall_operator.sh @@ -17,7 +17,13 @@ set -e # Note: if namespace is changed, the old namespace will not be deleted -eksctl utils write-kubeconfig --name=$CORTEX_CLUSTER >/dev/null 2>&1 -kubectl -n=$CORTEX_NAMESPACE delete --ignore-not-found=true deployment operator -kubectl -n=$CORTEX_NAMESPACE delete --ignore-not-found=true daemonset fluentd # Pods in DaemonSets cannot be modified +echo +eksctl utils write-kubeconfig --name=$CORTEX_CLUSTER + +echo -e "\nUninstalling the Cortex operator ..." 
+ +kubectl -n=$CORTEX_NAMESPACE delete --ignore-not-found=true deployment operator >/dev/null 2>&1 +kubectl -n=$CORTEX_NAMESPACE delete --ignore-not-found=true daemonset fluentd >/dev/null 2>&1 # Pods in DaemonSets cannot be modified + +echo "✓ Uninstalled the Cortex operator" From 5142ca376d7539909de9b6bb21fd4dc472f28143 Mon Sep 17 00:00:00 2001 From: Omer Spillinger Date: Mon, 1 Jul 2019 17:07:56 -0700 Subject: [PATCH 6/6] Improve bucket configuration --- cortex.sh | 19 ++--- images/manager/Dockerfile | 2 +- manager/install_aws.sh | 81 ------------------- manager/install_cortex.sh | 63 +++++++++++++++ manager/install_eks.sh | 24 ++++++ .../{uninstall_aws.sh => uninstall_eks.sh} | 2 + 6 files changed, 97 insertions(+), 94 deletions(-) delete mode 100755 manager/install_aws.sh create mode 100755 manager/install_eks.sh rename manager/{uninstall_aws.sh => uninstall_eks.sh} (98%) diff --git a/cortex.sh b/cortex.sh index d5b217ef23..82994db88f 100755 --- a/cortex.sh +++ b/cortex.sh @@ -91,13 +91,11 @@ set -u export CORTEX_VERSION_STABLE=master # Defaults -random_id=$(cat /dev/urandom | LC_CTYPE=C tr -dc 'a-z0-9' | fold -w 12 | head -n 1) - export AWS_ACCESS_KEY_ID="${AWS_ACCESS_KEY_ID:-""}" export AWS_SECRET_ACCESS_KEY="${AWS_SECRET_ACCESS_KEY:-""}" export CORTEX_LOG_GROUP="${CORTEX_LOG_GROUP:-cortex}" -export CORTEX_BUCKET="${CORTEX_BUCKET:-cortex-$random_id}" +export CORTEX_BUCKET="${CORTEX_BUCKET:-""}" export CORTEX_REGION="${CORTEX_REGION:-us-west-2}" export CORTEX_CLUSTER="${CORTEX_CLUSTER:-cortex}" @@ -127,22 +125,19 @@ export CORTEX_ENABLE_TELEMETRY="${CORTEX_ENABLE_TELEMETRY:-""}" ### TOP-LEVEL COMMANDS ### ########################## -function install_aws() { - docker run --entrypoint /root/install_aws.sh \ +function install_eks() { + docker run --entrypoint /root/install_eks.sh \ -e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \ -e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \ -e CORTEX_CLUSTER=$CORTEX_CLUSTER \ -e CORTEX_NODE_TYPE=$CORTEX_NODE_TYPE \ -e 
CORTEX_NODES_MIN=$CORTEX_NODES_MIN \ -e CORTEX_NODES_MAX=$CORTEX_NODES_MAX \ - -e CORTEX_LOG_GROUP=$CORTEX_LOG_GROUP \ - -e CORTEX_BUCKET=$CORTEX_BUCKET \ - -e CORTEX_REGION=$CORTEX_REGION \ cortexlabs/manager } -function uninstall_aws() { - docker run --entrypoint /root/uninstall_aws.sh \ +function uninstall_eks() { + docker run --entrypoint /root/uninstall_eks.sh \ -e AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID \ -e AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY \ -e CORTEX_CLUSTER=$CORTEX_CLUSTER \ @@ -405,7 +400,7 @@ if [ "$arg1" = "install" ]; then show_help exit 1 elif [ "$arg2" = "" ]; then - prompt_for_telemetry && install_aws && install_cortex && info + prompt_for_telemetry && install_eks && install_cortex && info elif [ "$arg2" = "cli" ]; then install_cli elif [ "$arg2" = "" ]; then @@ -423,7 +418,7 @@ elif [ "$arg1" = "uninstall" ]; then show_help exit 1 elif [ "$arg2" = "" ]; then - uninstall_cortex && uninstall_aws + uninstall_cortex && uninstall_eks elif [ "$arg2" = "cli" ]; then uninstall_cli elif [ "$arg2" = "" ]; then diff --git a/images/manager/Dockerfile b/images/manager/Dockerfile index 37733fca5c..9eed3cc4a4 100644 --- a/images/manager/Dockerfile +++ b/images/manager/Dockerfile @@ -7,7 +7,7 @@ ENV PATH /root/.local/bin:$PATH RUN pip3 install awscli --upgrade --user && \ rm -rf /root/.cache/pip* -RUN apk add --no-cache bash curl gettext +RUN apk add --no-cache bash curl gettext jq RUN curl --location "https://github.com/weaveworks/eksctl/releases/download/0.1.38/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp && \ mv /tmp/eksctl /usr/local/bin diff --git a/manager/install_aws.sh b/manager/install_aws.sh deleted file mode 100755 index d18230012e..0000000000 --- a/manager/install_aws.sh +++ /dev/null @@ -1,81 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Cortex Labs, Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -e - -function setup_bucket() { - if ! aws s3api head-bucket --bucket $CORTEX_BUCKET --output json 2>/dev/null; then - if aws s3 ls "s3://$CORTEX_BUCKET" --output json 2>&1 | grep -q 'NoSuchBucket'; then - echo -e "\n✓ Creating S3 bucket: $CORTEX_BUCKET" - aws s3api create-bucket --bucket $CORTEX_BUCKET \ - --region $CORTEX_REGION \ - --create-bucket-configuration LocationConstraint=$CORTEX_REGION \ - >/dev/null - else - echo -e "\nA bucket named \"${CORTEX_BUCKET}\" already exists, but you do not have access to it" - exit 1 - fi - else - echo -e "\n✓ Using existing S3 bucket: $CORTEX_BUCKET" - fi -} - -function setup_cloudwatch_logs() { - if ! 
aws logs list-tags-log-group --log-group-name $CORTEX_LOG_GROUP --region $CORTEX_REGION --output json 2>&1 | grep -q "\"tags\":"; then - echo -e "\n✓ Creating CloudWatch log group: $CORTEX_LOG_GROUP" - aws logs create-log-group --log-group-name $CORTEX_LOG_GROUP --region $CORTEX_REGION - else - echo -e "\n✓ Using existing CloudWatch log group: $CORTEX_LOG_GROUP" - fi -} - -function setup_configmap() { - kubectl -n=$CORTEX_NAMESPACE create configmap 'cortex-config' \ - --from-literal='LOG_GROUP'=$CORTEX_LOG_GROUP \ - --from-literal='BUCKET'=$CORTEX_BUCKET \ - --from-literal='REGION'=$CORTEX_REGION \ - --from-literal='NAMESPACE'=$CORTEX_NAMESPACE \ - --from-literal='IMAGE_OPERATOR'=$CORTEX_IMAGE_OPERATOR \ - --from-literal='IMAGE_SPARK'=$CORTEX_IMAGE_SPARK \ - --from-literal='IMAGE_TF_TRAIN'=$CORTEX_IMAGE_TF_TRAIN \ - --from-literal='IMAGE_TF_SERVE'=$CORTEX_IMAGE_TF_SERVE \ - --from-literal='IMAGE_TF_API'=$CORTEX_IMAGE_TF_API \ - --from-literal='IMAGE_PYTHON_PACKAGER'=$CORTEX_IMAGE_PYTHON_PACKAGER \ - --from-literal='IMAGE_TF_TRAIN_GPU'=$CORTEX_IMAGE_TF_TRAIN_GPU \ - --from-literal='IMAGE_TF_SERVE_GPU'=$CORTEX_IMAGE_TF_SERVE_GPU \ - --from-literal='ENABLE_TELEMETRY'=$CORTEX_ENABLE_TELEMETRY \ - -o yaml --dry-run | kubectl apply -f - >/dev/null -} - -function setup_secrets() { - kubectl -n=$CORTEX_NAMESPACE create secret generic 'aws-credentials' \ - --from-literal='AWS_ACCESS_KEY_ID'=$AWS_ACCESS_KEY_ID \ - --from-literal='AWS_SECRET_ACCESS_KEY'=$AWS_SECRET_ACCESS_KEY \ - -o yaml --dry-run | kubectl apply -f - >/dev/null -} - -echo -e "\nSpinning up a cluster ... 
(this will about 15 minutes)" - -echo -eksctl create cluster --name=$CORTEX_CLUSTER --asg-access --node-type=$CORTEX_NODE_TYPE --nodes-min=$CORTEX_NODES_MIN --nodes-max=$CORTEX_NODES_MAX - -echo -e "\n✓ Spun up a cluster" - -setup_bucket -setup_cloudwatch_logs - -setup_configmap -setup_secrets diff --git a/manager/install_cortex.sh b/manager/install_cortex.sh index b030c2acfc..54087460dd 100755 --- a/manager/install_cortex.sh +++ b/manager/install_cortex.sh @@ -16,6 +16,62 @@ set -e +function setup_bucket() { + if [ "$CORTEX_BUCKET" == "" ]; then + account_id_hash=$(aws sts get-caller-identity | jq .Account | sha256sum | cut -f1 -d" " | cut -c -10) + CORTEX_BUCKET="cortex-${account_id_hash}" + fi + + if ! aws s3api head-bucket --bucket $CORTEX_BUCKET --output json 2>/dev/null; then + if aws s3 ls "s3://$CORTEX_BUCKET" --output json 2>&1 | grep -q 'NoSuchBucket'; then + echo -e "\n✓ Creating an S3 bucket: $CORTEX_BUCKET" + aws s3api create-bucket --bucket $CORTEX_BUCKET \ + --region $CORTEX_REGION \ + --create-bucket-configuration LocationConstraint=$CORTEX_REGION \ + >/dev/null + else + echo -e "\nA bucket named \"${CORTEX_BUCKET}\" already exists, but you do not have access to it" + exit 1 + fi + else + echo -e "\n✓ Using an existing S3 bucket: $CORTEX_BUCKET" + fi +} + +function setup_cloudwatch_logs() { + if ! 
aws logs list-tags-log-group --log-group-name $CORTEX_LOG_GROUP --region $CORTEX_REGION --output json 2>&1 | grep -q "\"tags\":"; then + echo -e "\n✓ Creating a CloudWatch log group: $CORTEX_LOG_GROUP" + aws logs create-log-group --log-group-name $CORTEX_LOG_GROUP --region $CORTEX_REGION + else + echo -e "\n✓ Using an existing CloudWatch log group: $CORTEX_LOG_GROUP" + fi +} + +function setup_configmap() { + kubectl -n=$CORTEX_NAMESPACE create configmap 'cortex-config' \ + --from-literal='LOG_GROUP'=$CORTEX_LOG_GROUP \ + --from-literal='BUCKET'=$CORTEX_BUCKET \ + --from-literal='REGION'=$CORTEX_REGION \ + --from-literal='NAMESPACE'=$CORTEX_NAMESPACE \ + --from-literal='IMAGE_OPERATOR'=$CORTEX_IMAGE_OPERATOR \ + --from-literal='IMAGE_SPARK'=$CORTEX_IMAGE_SPARK \ + --from-literal='IMAGE_TF_TRAIN'=$CORTEX_IMAGE_TF_TRAIN \ + --from-literal='IMAGE_TF_SERVE'=$CORTEX_IMAGE_TF_SERVE \ + --from-literal='IMAGE_TF_API'=$CORTEX_IMAGE_TF_API \ + --from-literal='IMAGE_PYTHON_PACKAGER'=$CORTEX_IMAGE_PYTHON_PACKAGER \ + --from-literal='IMAGE_TF_TRAIN_GPU'=$CORTEX_IMAGE_TF_TRAIN_GPU \ + --from-literal='IMAGE_TF_SERVE_GPU'=$CORTEX_IMAGE_TF_SERVE_GPU \ + --from-literal='ENABLE_TELEMETRY'=$CORTEX_ENABLE_TELEMETRY \ + -o yaml --dry-run | kubectl apply -f - >/dev/null +} + +function setup_secrets() { + kubectl -n=$CORTEX_NAMESPACE create secret generic 'aws-credentials' \ + --from-literal='AWS_ACCESS_KEY_ID'=$AWS_ACCESS_KEY_ID \ + --from-literal='AWS_SECRET_ACCESS_KEY'=$AWS_SECRET_ACCESS_KEY \ + -o yaml --dry-run | kubectl apply -f - >/dev/null +} + function validate_cortex() { set +e @@ -99,7 +155,14 @@ eksctl utils write-kubeconfig --name=$CORTEX_CLUSTER echo -e "\nInstalling Cortex ..." 
+setup_bucket +setup_cloudwatch_logs + envsubst < manifests/namespace.yaml | kubectl apply -f - >/dev/null + +setup_configmap +setup_secrets + envsubst < manifests/spark.yaml | kubectl apply -f - >/dev/null envsubst < manifests/argo.yaml | kubectl apply -f - >/dev/null envsubst < manifests/nginx.yaml | kubectl apply -f - >/dev/null diff --git a/manager/install_eks.sh b/manager/install_eks.sh new file mode 100755 index 0000000000..23e797e51f --- /dev/null +++ b/manager/install_eks.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +# Copyright 2019 Cortex Labs, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +echo -e "\nSpinning up the cluster ... (this will take about 15 minutes)" + +echo +eksctl create cluster --name=$CORTEX_CLUSTER --asg-access --node-type=$CORTEX_NODE_TYPE --nodes-min=$CORTEX_NODES_MIN --nodes-max=$CORTEX_NODES_MAX + +echo -e "\n✓ Spun up the cluster" diff --git a/manager/uninstall_aws.sh b/manager/uninstall_eks.sh similarity index 98% rename from manager/uninstall_aws.sh rename to manager/uninstall_eks.sh index 5ab52d13c4..c61369e61c 100755 --- a/manager/uninstall_aws.sh +++ b/manager/uninstall_eks.sh @@ -14,6 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +set -e + echo -e "\nSpinning down the cluster ... (this will take a few minutes)" echo