diff --git a/.travis.yml b/.travis.yml
index f0c9aa80df..920326c4bf 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -13,6 +13,7 @@ services:
 script:
   - docker build . -f images/spark-base/Dockerfile -t cortexlabs/spark-base:latest
   - docker build . -f images/tf-base/Dockerfile -t cortexlabs/tf-base:latest
+  - docker build . -f images/tf-base-gpu/Dockerfile -t cortexlabs/tf-base-gpu:latest

   - ./build/images.sh images/operator operator
   - ./build/images.sh images/spark spark
@@ -20,6 +21,8 @@ script:
   - ./build/images.sh images/tf-train tf-train
   - ./build/images.sh images/tf-serve tf-serve
   - ./build/images.sh images/tf-api tf-api
+  - ./build/images.sh images/tf-serve-gpu tf-serve-gpu
+  - ./build/images.sh images/tf-train-gpu tf-train-gpu
   - ./build/images.sh images/nginx-controller nginx-controller
   - ./build/images.sh images/nginx-backend nginx-backend
   - ./build/images.sh images/fluentd fluentd
diff --git a/cortex.sh b/cortex.sh
index 823aefeaa4..46dd19f169 100755
--- a/cortex.sh
+++ b/cortex.sh
@@ -147,6 +147,8 @@ export CORTEX_IMAGE_TF_SERVE="${CORTEX_IMAGE_TF_SERVE:-cortexlabs/tf-serve:$CORT
 export CORTEX_IMAGE_TF_TRAIN="${CORTEX_IMAGE_TF_TRAIN:-cortexlabs/tf-train:$CORTEX_VERSION_STABLE}"
 export CORTEX_IMAGE_TF_API="${CORTEX_IMAGE_TF_API:-cortexlabs/tf-api:$CORTEX_VERSION_STABLE}"
 export CORTEX_IMAGE_PYTHON_PACKAGER="${CORTEX_IMAGE_PYTHON_PACKAGER:-cortexlabs/python-packager:$CORTEX_VERSION_STABLE}"
+export CORTEX_IMAGE_TF_SERVE_GPU="${CORTEX_IMAGE_TF_SERVE_GPU:-cortexlabs/tf-serve-gpu:$CORTEX_VERSION_STABLE}"
+export CORTEX_IMAGE_TF_TRAIN_GPU="${CORTEX_IMAGE_TF_TRAIN_GPU:-cortexlabs/tf-train-gpu:$CORTEX_VERSION_STABLE}"

 export AWS_ACCESS_KEY_ID="${AWS_ACCESS_KEY_ID:-""}"
 export AWS_SECRET_ACCESS_KEY="${AWS_SECRET_ACCESS_KEY:-""}"
@@ -291,6 +293,8 @@ function setup_configmap() {
     --from-literal='IMAGE_TF_SERVE'=$CORTEX_IMAGE_TF_SERVE \
     --from-literal='IMAGE_TF_API'=$CORTEX_IMAGE_TF_API \
     --from-literal='IMAGE_PYTHON_PACKAGER'=$CORTEX_IMAGE_PYTHON_PACKAGER \
+    --from-literal='IMAGE_TF_TRAIN_GPU'=$CORTEX_IMAGE_TF_TRAIN_GPU \
+    --from-literal='IMAGE_TF_SERVE_GPU'=$CORTEX_IMAGE_TF_SERVE_GPU \
     -o yaml --dry-run | kubectl apply -f - >/dev/null
 }
diff --git a/dev/eks.sh b/dev/eks.sh
index d8aee8becd..eea9352ca1 100755
--- a/dev/eks.sh
+++ b/dev/eks.sh
@@ -26,7 +26,12 @@ function eks_set_cluster() {
 }

 if [ "$1" = "start" ]; then
-  eksctl create cluster --version=1.11 --name=$K8S_NAME --region $K8S_REGION --nodes=$K8S_NODE_COUNT --node-type=$K8S_NODE_INSTANCE_TYPE
+  eksctl create cluster --version=1.11 --name=$K8S_NAME --region $K8S_REGION --nodes-max $K8S_NODES_MAX_COUNT --nodes-min $K8S_NODES_MIN_COUNT --node-type=$K8S_NODE_INSTANCE_TYPE
+  if [ $K8S_GPU_NODES_MIN_COUNT -gt 0 ] || [ $K8S_GPU_NODES_MAX_COUNT -gt 0 ]; then
+    eksctl create nodegroup --version=1.11 --cluster=$K8S_NAME --nodes-max=$K8S_GPU_NODES_MAX_COUNT --nodes-min=$K8S_GPU_NODES_MIN_COUNT --node-type=$K8S_GPU_NODE_INSTANCE_TYPE --node-ami=$K8S_GPU_NODE_AMI
+    echo "Once the GPU nodegroup joins the cluster, run:"
+    echo "kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v1.11/nvidia-device-plugin.yml"
+  fi
   eks_set_cluster

 elif [ "$1" = "update" ]; then
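Note on the eks.sh change: the NVIDIA device plugin DaemonSet (applied via the `kubectl apply` command echoed above) is what registers GPUs with the kubelet. A quick sanity check after the nodegroup joins — a sketch using standard kubectl, assuming the device plugin is running:

```bash
# Non-zero GPU counts confirm the NVIDIA device plugin has registered
# each node's GPUs as the nvidia.com/gpu extended resource.
kubectl get nodes -o custom-columns=NAME:.metadata.name,GPU:".status.allocatable.nvidia\.com/gpu"
```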
diff --git a/dev/kops.sh b/dev/kops.sh
index d92ebe012a..44e6eeda9d 100755
--- a/dev/kops.sh
+++ b/dev/kops.sh
@@ -131,8 +131,8 @@ spec:
   image: kope.io/k8s-1.11-debian-stretch-amd64-hvm-ebs-2018-08-17
   machineType: ${K8S_NODE_INSTANCE_TYPE}
   rootVolumeSize: ${K8S_NODE_VOLUME_SIZE}
-  maxSize: ${K8S_NODE_COUNT}
-  minSize: ${K8S_NODE_COUNT}
+  maxSize: ${K8S_NODES_MAX_COUNT}
+  minSize: ${K8S_NODES_MIN_COUNT}
   nodeLabels:
     kops.k8s.io/instancegroup: nodes
   role: Node
diff --git a/dev/registry.sh b/dev/registry.sh
index e2a853e47a..468ee09d67 100755
--- a/dev/registry.sh
+++ b/dev/registry.sh
@@ -47,6 +47,8 @@ function create_registry() {
   aws ecr create-repository --repository-name=cortexlabs/tf-train --region=$REGISTRY_REGION || true
   aws ecr create-repository --repository-name=cortexlabs/tf-api --region=$REGISTRY_REGION || true
   aws ecr create-repository --repository-name=cortexlabs/python-packager --region=$REGISTRY_REGION || true
+  aws ecr create-repository --repository-name=cortexlabs/tf-train-gpu --region=$REGISTRY_REGION || true
+  aws ecr create-repository --repository-name=cortexlabs/tf-serve-gpu --region=$REGISTRY_REGION || true
 }

 ### HELPERS ###
@@ -115,6 +117,7 @@ elif [ "$cmd" = "update" ]; then
     cache_builder $ROOT/images/spark-base spark-base
     build_base $ROOT/images/spark-base spark-base
     build_base $ROOT/images/tf-base tf-base
+    build_base $ROOT/images/tf-base-gpu tf-base-gpu

     cache_builder $ROOT/images/operator operator
     build_and_push $ROOT/images/operator operator latest
@@ -128,11 +131,13 @@ elif [ "$cmd" = "update" ]; then
     build_and_push $ROOT/images/argo-controller argo-controller latest
     build_and_push $ROOT/images/argo-executor argo-executor latest
     build_and_push $ROOT/images/tf-serve tf-serve latest
+    build_and_push $ROOT/images/tf-serve-gpu tf-serve-gpu latest
     build_and_push $ROOT/images/python-packager python-packager latest
   fi

   build_and_push $ROOT/images/spark spark latest
   build_and_push $ROOT/images/tf-train tf-train latest
+  build_and_push $ROOT/images/tf-train-gpu tf-train-gpu latest
   build_and_push $ROOT/images/tf-api tf-api latest

   cleanup
diff --git a/docs/applications/advanced/compute.md b/docs/applications/advanced/compute.md
index f25f840c01..56029af150 100644
--- a/docs/applications/advanced/compute.md
+++ b/docs/applications/advanced/compute.md
@@ -10,6 +10,7 @@ For example:
   compute:
     cpu: "2"
     mem: "1Gi"
+    gpu: 1
 ```

 CPU and memory requests in Cortex correspond to compute resource requests in Kubernetes. In the example above, the training job will only be scheduled once 2 CPUs and 1Gi of memory are available, and the job will be guaranteed to have access to those resources throughout its execution. In some cases, a Cortex compute resource request can be (or may default to) `Null`.
@@ -21,3 +22,9 @@ One unit of CPU corresponds to one virtual CPU on AWS. Fractional requests are a
 ## Memory

 One unit of memory is one byte. Memory can be expressed as an integer or by using one of these suffixes: `K`, `M`, `G`, `T` (or their power-of-two counterparts: `Ki`, `Mi`, `Gi`, `Ti`). For example, the following values represent roughly the same memory: `128974848`, `129e6`, `129M`, `123Mi`.
+
+## GPU
+One unit of GPU corresponds to one GPU on AWS. Fractional requests are not allowed. See the AWS documentation on [adding GPU-enabled nodes on EKS](https://docs.aws.amazon.com/en_ca/eks/latest/userguide/gpu-ami.html).
+
+## GPU Support
+We recommend using GPU compute requests on API resources only if your cluster has enough GPU nodes to cover the GPU requests of model training plus APIs (ideally with an autoscaler). Otherwise, because rolling updates are zero-downtime, GPUs will remain allocated to API replicas from the previous deployment, and model training may be left without sufficient GPU resources.
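To make the new `gpu` field concrete, here is a hypothetical configuration fragment requesting GPUs for both training and serving (the resource names are invented; the `compute` fields are the ones documented in this diff):

```yaml
- kind: model
  name: classifier  # hypothetical model name
  compute:
    cpu: "2"
    mem: "4Gi"
    gpu: 1  # train on the GPU image, on a GPU node

- kind: api
  name: classifier-api  # hypothetical API name
  model_name: classifier
  compute:
    replicas: 2
    gpu: 1  # each replica requests one GPU
```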
diff --git a/docs/applications/resources/apis.md b/docs/applications/resources/apis.md
index aca008f1a1..aa2fb07183 100644
--- a/docs/applications/resources/apis.md
+++ b/docs/applications/resources/apis.md
@@ -12,6 +12,7 @@ Serve models at scale and use them to build smarter applications.
     replicas: <int>  # number of replicas to launch (default: 1)
     cpu: <string>  # CPU request (default: Null)
     mem: <string>  # memory request (default: Null)
+    gpu: <string>  # GPU request (default: Null)
   tags:
     <string>: <scalar>  # arbitrary key/value pairs to attach to the resource (optional)
   ...
diff --git a/docs/applications/resources/models.md b/docs/applications/resources/models.md
index 1d8b5c47bb..9f7773b9a3 100644
--- a/docs/applications/resources/models.md
+++ b/docs/applications/resources/models.md
@@ -44,6 +44,7 @@ Train custom TensorFlow models at scale.
   compute:
     cpu: <string>  # CPU request (default: Null)
     mem: <string>  # memory request (default: Null)
+    gpu: <string>  # GPU request (default: Null)

   tags:
     <string>: <scalar>  # arbitrary key/value pairs to attach to the resource (optional)
diff --git a/docs/applications/resources/statuses.md b/docs/applications/resources/statuses.md
index 3c6c02be63..49b1fa65c6 100644
--- a/docs/applications/resources/statuses.md
+++ b/docs/applications/resources/statuses.md
@@ -12,7 +12,7 @@
 | terminated | Resource was terminated |
 | upstream error | Resource was not created due to an error in one of its dependencies |
 | upstream termination | Resource was not created because one of its dependencies was terminated |
-| compute unavailable | Resource's workload could not start due to insufficient memory or CPU in the cluster |
+| compute unavailable | Resource's workload could not start due to insufficient memory, CPU, or GPU in the cluster |

 ## API statuses

@@ -29,4 +29,4 @@
 | update skipped | API was not updated due to an error in another resource; a previous version of this API is ready |
 | upstream error | API was not created due to an error in one of its dependencies; a previous version of this API may be ready |
 | upstream termination | API was not created because one of its dependencies was terminated; a previous version of this API may be ready |
-| compute unavailable | API could not start due to insufficient memory or CPU in the cluster; some replicas may be ready |
+| compute unavailable | API could not start due to insufficient memory, CPU, or GPU in the cluster; some replicas may be ready |
diff --git a/images/python-packager/Dockerfile b/images/python-packager/Dockerfile
index 6ae534717a..4122a81d71 100644
--- a/images/python-packager/Dockerfile
+++ b/images/python-packager/Dockerfile
@@ -1,4 +1,4 @@
-FROM ubuntu:18.04
+FROM ubuntu:16.04

 RUN apt-get update -qq && apt-get install -y -q \
     python3 \
diff --git a/images/spark-base/Dockerfile b/images/spark-base/Dockerfile
index eebd98e9c2..352702b771 100644
--- a/images/spark-base/Dockerfile
+++ b/images/spark-base/Dockerfile
@@ -1,4 +1,4 @@
-FROM ubuntu:18.04 as builder
+FROM ubuntu:16.04 as builder

 RUN apt-get update -qq && apt-get install -y -q \
     git \
@@ -47,7 +47,7 @@ RUN wget -q -P $SPARK_HOME/jars/ http://central.maven.org/maven2/com/amazonaws/a

 COPY images/spark-base/conf/* $SPARK_HOME/conf/

-FROM ubuntu:18.04
+FROM ubuntu:16.04

 ENV JAVA_HOME="/usr/lib/jvm/java-8-openjdk-amd64"
 ENV HADOOP_HOME="/opt/hadoop"
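The Dockerfiles below pin the CPU and GPU base images to the upstream `tensorflow/tensorflow` 1.12.0 images. A local build-and-smoke-test sketch (the build command mirrors the Travis step above; the run command assumes nvidia-docker is installed on a GPU host):

```bash
# Build the GPU base image from the repo root
docker build . -f images/tf-base-gpu/Dockerfile -t cortexlabs/tf-base-gpu:latest

# Smoke test: confirm TensorFlow 1.12 can see the GPU
docker run --rm --runtime=nvidia cortexlabs/tf-base-gpu:latest \
  python3 -c "import tensorflow as tf; print(tf.test.is_gpu_available())"
```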
diff --git a/images/tf-base-gpu/Dockerfile b/images/tf-base-gpu/Dockerfile
new file mode 100644
index 0000000000..d696be771a
--- /dev/null
+++ b/images/tf-base-gpu/Dockerfile
@@ -0,0 +1,5 @@
+FROM tensorflow/tensorflow:1.12.0-gpu-py3
+
+RUN apt-get update -qq && apt-get install -y -q \
+    zlib1g-dev \
+    && apt-get clean -qq && rm -rf /var/lib/apt/lists/*
diff --git a/images/tf-base/Dockerfile b/images/tf-base/Dockerfile
index 359b3514f2..4904c1ad82 100644
--- a/images/tf-base/Dockerfile
+++ b/images/tf-base/Dockerfile
@@ -1,21 +1,6 @@
-FROM ubuntu:18.04
-
-ARG TF_VERSION="1.12.0"
+FROM tensorflow/tensorflow:1.12.0-py3

 RUN apt-get update -qq && apt-get install -y -q \
-    build-essential \
-    curl \
-    libfreetype6-dev \
-    libpng-dev \
-    libzmq3-dev \
-    pkg-config \
-    python3 \
-    python3-dev \
-    python3-pip \
-    rsync \
-    software-properties-common \
-    unzip \
     zlib1g-dev \
     && apt-get clean -qq && rm -rf /var/lib/apt/lists/*
-
-RUN pip3 install https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-${TF_VERSION}-cp36-cp36m-linux_x86_64.whl && rm -rf /root/.cache/pip*
diff --git a/images/tf-serve-gpu/Dockerfile b/images/tf-serve-gpu/Dockerfile
new file mode 100644
index 0000000000..de25316a76
--- /dev/null
+++ b/images/tf-serve-gpu/Dockerfile
@@ -0,0 +1,8 @@
+FROM cortexlabs/tf-base-gpu
+
+ARG TF_VERSION="1.12.0"
+
+RUN curl -o tensorflow-model-server.deb http://storage.googleapis.com/tensorflow-serving-apt/pool/tensorflow-model-server-${TF_VERSION}/t/tensorflow-model-server/tensorflow-model-server_${TF_VERSION}_all.deb
+RUN dpkg -i tensorflow-model-server.deb
+
+ENTRYPOINT ["tensorflow_model_server"]
diff --git a/images/tf-serve/Dockerfile b/images/tf-serve/Dockerfile
index a6e160cd50..7eedaaad5b 100644
--- a/images/tf-serve/Dockerfile
+++ b/images/tf-serve/Dockerfile
@@ -1,25 +1,9 @@
-FROM ubuntu:18.04
+FROM cortexlabs/tf-base

 ARG TF_VERSION="1.12.0"

 RUN apt-get update -qq && apt-get install -y -q \
-    automake \
-    build-essential \
     curl \
-    libcurl3-dev \
-    git \
-    libtool \
-    libfreetype6-dev \
-    libpng-dev \
-    libzmq3-dev \
-    pkg-config \
-    python3-dev \
-    python3-numpy \
-    python3-pip \
-    software-properties-common \
-    swig \
-    zip \
-    zlib1g-dev \
     && apt-get clean -qq && rm -rf /var/lib/apt/lists/*

 RUN curl -o tensorflow-model-server.deb http://storage.googleapis.com/tensorflow-serving-apt/pool/tensorflow-model-server-${TF_VERSION}/t/tensorflow-model-server/tensorflow-model-server_${TF_VERSION}_all.deb
diff --git a/images/tf-train-gpu/Dockerfile b/images/tf-train-gpu/Dockerfile
new file mode 100644
index 0000000000..3dd7f65457
--- /dev/null
+++ b/images/tf-train-gpu/Dockerfile
@@ -0,0 +1,15 @@
+FROM cortexlabs/tf-base-gpu
+
+ENV PYTHONPATH="/src:${PYTHONPATH}"
+
+COPY pkg/workloads/lib/requirements.txt /src/lib/requirements.txt
+COPY pkg/workloads/tf_train/requirements.txt /src/tf_train/requirements.txt
+RUN pip3 install -r /src/lib/requirements.txt && \
+    pip3 install -r /src/tf_train/requirements.txt && \
+    rm -rf /root/.cache/pip*
+
+COPY pkg/workloads/consts.py /src/
+COPY pkg/workloads/lib /src/lib
+COPY pkg/workloads/tf_train /src/tf_train
+
+ENTRYPOINT ["/usr/bin/python3", "/src/tf_train/train.py"]
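In the Go changes below, note the asymmetry: `TFCompute.GPU` is an `*int64` (nil means not requested, and any value must be greater than 0), while `APICompute.GPU` is a plain `int64` defaulting to 0. A minimal sketch of what that buys (simplified stand-in types, not the real structs):

```go
package main

import "fmt"

// Simplified stand-ins for the structs in compute.go.
type TFCompute struct{ GPU *int64 } // nil = GPU not requested
type APICompute struct{ GPU int64 } // 0 = GPU not requested

func main() {
	var training TFCompute        // gpu omitted in config -> nil
	serving := APICompute{GPU: 1} // gpu: 1 in config

	// The operator branches on these values to pick the CPU or GPU image.
	fmt.Println("train on GPU image:", training.GPU != nil) // false
	fmt.Println("serve on GPU image:", serving.GPU > 0)     // true
}
```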
diff --git a/pkg/api/userconfig/compute.go b/pkg/api/userconfig/compute.go
index a27da3a440..52ff73af13 100644
--- a/pkg/api/userconfig/compute.go
+++ b/pkg/api/userconfig/compute.go
@@ -142,6 +142,7 @@ func (sparkCompute *SparkCompute) ID() string {
 type TFCompute struct {
     CPU *Quantity `json:"cpu" yaml:"cpu"`
     Mem *Quantity `json:"mem" yaml:"mem"`
+    GPU *int64    `json:"gpu" yaml:"gpu"`
 }

 var tfComputeFieldValidation = &cr.StructFieldValidation{
@@ -166,6 +167,13 @@ var tfComputeFieldValidation = &cr.StructFieldValidation{
                 Min: k8sresource.MustParse("0"),
             }),
         },
+        &cr.StructFieldValidation{
+            StructField: "GPU",
+            Int64PtrValidation: &cr.Int64PtrValidation{
+                Default:     nil,
+                GreaterThan: util.Int64Ptr(0),
+            },
+        },
     },
   },
 }
@@ -181,6 +189,7 @@ type APICompute struct {
     Replicas int32     `json:"replicas" yaml:"replicas"`
     CPU      *Quantity `json:"cpu" yaml:"cpu"`
     Mem      *Quantity `json:"mem" yaml:"mem"`
+    GPU      int64     `json:"gpu" yaml:"gpu"`
 }

 var apiComputeFieldValidation = &cr.StructFieldValidation{
@@ -212,6 +221,13 @@ var apiComputeFieldValidation = &cr.StructFieldValidation{
                 Min: k8sresource.MustParse("0"),
             }),
         },
+        &cr.StructFieldValidation{
+            StructField: "GPU",
+            Int64Validation: &cr.Int64Validation{
+                Default:              0,
+                GreaterThanOrEqualTo: util.Int64Ptr(0),
+            },
+        },
     },
   },
 }
@@ -221,6 +237,7 @@ func (apiCompute *APICompute) ID() string {
     buf.WriteString(s.Int32(apiCompute.Replicas))
     buf.WriteString(QuantityPtrID(apiCompute.CPU))
     buf.WriteString(QuantityPtrID(apiCompute.Mem))
+    buf.WriteString(s.Int64(apiCompute.GPU))
     return util.HashBytes(buf.Bytes())
 }
@@ -228,6 +245,7 @@ func (apiCompute *APICompute) IDWithoutReplicas() string {
     var buf bytes.Buffer
     buf.WriteString(QuantityPtrID(apiCompute.CPU))
     buf.WriteString(QuantityPtrID(apiCompute.Mem))
+    buf.WriteString(s.Int64(apiCompute.GPU))
     return util.HashBytes(buf.Bytes())
 }
@@ -284,6 +302,11 @@ func MaxTFCompute(tfComputes ...*TFCompute) *TFCompute {
                 aggregated.Mem = tfCompute.Mem
             }
         }
+        if tfCompute.GPU != nil {
+            if aggregated.GPU == nil || *tfCompute.GPU > *aggregated.GPU {
+                aggregated.GPU = tfCompute.GPU
+            }
+        }
     }

     return &aggregated
@@ -299,5 +322,10 @@ func (apiCompute *APICompute) Equal(apiCompute2 APICompute) bool {
     if !QuantityPtrsEqual(apiCompute.Mem, apiCompute2.Mem) {
         return false
     }
+
+    if apiCompute.GPU != apiCompute2.GPU {
+        return false
+    }
+
     return true
 }
diff --git a/pkg/operator/cortexconfig/cortex_config.go b/pkg/operator/cortexconfig/cortex_config.go
index 05a7276dc8..e423e283bc 100644
--- a/pkg/operator/cortexconfig/cortex_config.go
+++ b/pkg/operator/cortexconfig/cortex_config.go
@@ -34,6 +34,8 @@ var (
     TFServeImage        string
     TFAPIImage          string
     PythonPackagerImage string
+    TFTrainImageGPU     string
+    TFServeImageGPU     string
 )

 func init() {
@@ -47,6 +49,8 @@ func init() {
     TFServeImage = getStr("IMAGE_TF_SERVE")
     TFAPIImage = getStr("IMAGE_TF_API")
     PythonPackagerImage = getStr("IMAGE_PYTHON_PACKAGER")
+    TFTrainImageGPU = getStr("IMAGE_TF_TRAIN_GPU")
+    TFServeImageGPU = getStr("IMAGE_TF_SERVE_GPU")
 }

 //
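Both workload builders below follow the same pattern: swap in the GPU image and attach `nvidia.com/gpu` to the container's resources. One Kubernetes detail worth knowing: extended resources such as `nvidia.com/gpu` must be whole integers and must appear in `limits` (with `requests` equal to `limits` when both are set), which is why the diffs add a separate limits list. A condensed sketch of the pattern (not the exact operator code):

```go
package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	k8sresource "k8s.io/apimachinery/pkg/api/resource"
)

// gpuResources mirrors how api.go and training_job.go attach GPUs:
// the same integer quantity goes into both requests and limits.
func gpuResources(gpu int64) corev1.ResourceRequirements {
	requests := corev1.ResourceList{}
	limits := corev1.ResourceList{}
	if gpu > 0 {
		q := *k8sresource.NewQuantity(gpu, k8sresource.DecimalSI)
		requests["nvidia.com/gpu"] = q
		limits["nvidia.com/gpu"] = q
	}
	return corev1.ResourceRequirements{Requests: requests, Limits: limits}
}

func main() {
	fmt.Println(gpuResources(1))
}
```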
diff --git a/pkg/operator/workloads/api.go b/pkg/operator/workloads/api.go
index f4afeb606b..26f66e6284 100644
--- a/pkg/operator/workloads/api.go
+++ b/pkg/operator/workloads/api.go
@@ -21,6 +21,7 @@ import (
     appsv1b1 "k8s.io/api/apps/v1beta1"
     corev1 "k8s.io/api/core/v1"
+    k8sresource "k8s.io/apimachinery/pkg/api/resource"

     "github.com/cortexlabs/cortex/pkg/api/context"
     s "github.com/cortexlabs/cortex/pkg/api/strings"
@@ -47,6 +48,7 @@ func apiSpec(

     transformResourceList := corev1.ResourceList{}
     tfServingResourceList := corev1.ResourceList{}
+    tfServingLimitsList := corev1.ResourceList{}

     if apiCompute.CPU != nil {
         q1, q2 := apiCompute.CPU.SplitInTwo()
@@ -59,6 +61,13 @@
         tfServingResourceList[corev1.ResourceMemory] = *q2
     }

+    servingImage := cc.TFServeImage
+    if apiCompute.GPU > 0 {
+        servingImage = cc.TFServeImageGPU
+        tfServingResourceList["nvidia.com/gpu"] = *k8sresource.NewQuantity(apiCompute.GPU, k8sresource.DecimalSI)
+        tfServingLimitsList["nvidia.com/gpu"] = *k8sresource.NewQuantity(apiCompute.GPU, k8sresource.DecimalSI)
+    }
+
     return k8s.Deployment(&k8s.DeploymentSpec{
         Name:     internalAPIName(apiName, ctx.App.Name),
         Replicas: ctx.APIs[apiName].Compute.Replicas,
@@ -106,7 +115,7 @@ func apiSpec(
                 },
                 {
                     Name:            tfServingContainerName,
-                    Image:           cc.TFServeImage,
+                    Image:           servingImage,
                     ImagePullPolicy: "Always",
                     Args: []string{
                         "--port=" + tfServingPortStr,
@@ -116,6 +125,7 @@
                     VolumeMounts: k8s.DefaultVolumeMounts(),
                     Resources: corev1.ResourceRequirements{
                         Requests: tfServingResourceList,
+                        Limits:   tfServingLimitsList,
                     },
                 },
             },
@@ -300,18 +310,20 @@ func APIDeploymentCompute(deployment *appsv1b1.Deployment) userconfig.APICompute
         replicas = *deployment.Spec.Replicas
     }

-    cpu, mem := APIPodCompute(deployment.Spec.Template.Spec.Containers)
+    cpu, mem, gpu := APIPodCompute(deployment.Spec.Template.Spec.Containers)

     return userconfig.APICompute{
         Replicas: replicas,
         CPU:      cpu,
         Mem:      mem,
+        GPU:      gpu,
     }
 }

-func APIPodCompute(containers []corev1.Container) (*userconfig.Quantity, *userconfig.Quantity) {
-    var totalCPU *userconfig.Quantity = nil
-    var totalMem *userconfig.Quantity = nil
+func APIPodCompute(containers []corev1.Container) (*userconfig.Quantity, *userconfig.Quantity, int64) {
+    var totalCPU *userconfig.Quantity
+    var totalMem *userconfig.Quantity
+    var totalGPU int64

     for _, container := range containers {
         if container.Name != apiContainerName && container.Name != tfServingContainerName {
@@ -335,7 +347,13 @@ func APIPodCompute(containers []corev1.Container) (*userconfig.Quantity, *userco
             }
             totalMem.Add(mem)
         }
+        if gpu, ok := requests["nvidia.com/gpu"]; ok {
+            gpuVal, ok := gpu.AsInt64()
+            if ok {
+                totalGPU += gpuVal
+            }
+        }
     }

-    return totalCPU, totalMem
+    return totalCPU, totalMem, totalGPU
 }
diff --git a/pkg/operator/workloads/api_status.go b/pkg/operator/workloads/api_status.go
index a30d804387..ae4bf74389 100644
--- a/pkg/operator/workloads/api_status.go
+++ b/pkg/operator/workloads/api_status.go
@@ -117,10 +117,11 @@ func getReplicaCountsMap(podList []corev1.Pod, ctx *context.Context) map[string]
     for _, pod := range podList {
         resourceID := pod.Labels["resourceID"]
-        cpu, mem := APIPodCompute(pod.Spec.Containers)
+        cpu, mem, gpu := APIPodCompute(pod.Spec.Containers)
         podAPICompute := userconfig.APICompute{
             CPU: cpu,
             Mem: mem,
+            GPU: gpu,
         }
         podAPIComputeID := podAPICompute.IDWithoutReplicas()
         podStatus := k8s.GetPodStatus(&pod)
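`APIPodCompute` above sums GPU requests across containers via `Quantity.AsInt64`, which reports whether the quantity fits in an int64 (GPU quantities always do, since fractional GPUs are not allowed). An isolated illustration with assumed per-container values:

```go
package main

import (
	"fmt"

	k8sresource "k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	var totalGPU int64
	for _, s := range []string{"1", "2"} { // assumed per-container GPU requests
		q := k8sresource.MustParse(s)
		if v, ok := q.AsInt64(); ok {
			totalGPU += v
		}
	}
	fmt.Println("total GPUs:", totalGPU) // total GPUs: 3
}
```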
diff --git a/pkg/operator/workloads/training_job.go b/pkg/operator/workloads/training_job.go
index 17b0d7e5c3..a1e2df5b51 100644
--- a/pkg/operator/workloads/training_job.go
+++ b/pkg/operator/workloads/training_job.go
@@ -19,6 +19,7 @@ package workloads
 import (
     batchv1 "k8s.io/api/batch/v1"
     corev1 "k8s.io/api/core/v1"
+    k8sresource "k8s.io/apimachinery/pkg/api/resource"

     "github.com/cortexlabs/cortex/pkg/api/context"
     "github.com/cortexlabs/cortex/pkg/api/userconfig"
@@ -38,6 +39,7 @@ func trainingJobSpec(
 ) *batchv1.Job {

     resourceList := corev1.ResourceList{}
+    limitsList := corev1.ResourceList{}
     if tfCompute.CPU != nil {
         resourceList[corev1.ResourceCPU] = tfCompute.CPU.Quantity
     }
@@ -45,6 +47,13 @@
         resourceList[corev1.ResourceMemory] = tfCompute.Mem.Quantity
     }

+    trainImage := cc.TFTrainImage
+    if tfCompute.GPU != nil {
+        trainImage = cc.TFTrainImageGPU
+        resourceList["nvidia.com/gpu"] = *k8sresource.NewQuantity(*tfCompute.GPU, k8sresource.DecimalSI)
+        limitsList["nvidia.com/gpu"] = *k8sresource.NewQuantity(*tfCompute.GPU, k8sresource.DecimalSI)
+    }
+
     spec := k8s.Job(&k8s.JobSpec{
         Name: workloadID,
         Labels: map[string]string{
@@ -64,7 +73,7 @@
             Containers: []corev1.Container{
                 {
                     Name:            "train",
-                    Image:           cc.TFTrainImage,
+                    Image:           trainImage,
                     ImagePullPolicy: "Always",
                     Args: []string{
                         "--workload-id=" + workloadID,
@@ -76,6 +85,7 @@
                     VolumeMounts: k8s.DefaultVolumeMounts(),
                     Resources: corev1.ResourceRequirements{
                         Requests: resourceList,
+                        Limits:   limitsList,
                     },
                 },
             },
diff --git a/pkg/workloads/lib/aws.py b/pkg/workloads/lib/aws.py
index 69c5102b88..7a6f5f0d24 100644
--- a/pkg/workloads/lib/aws.py
+++ b/pkg/workloads/lib/aws.py
@@ -151,7 +151,7 @@ def upload_json_to_s3(obj, key, bucket, client_config={}):

 def read_json_from_s3(key, bucket, allow_missing=True, client_config={}):
     obj = read_bytes_from_s3(key, bucket, allow_missing, client_config)
     if obj is None:
         return None
-    return json.loads(obj)
+    return json.loads(obj.decode("utf-8"))
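The aws.py change exists because the base images moved to Python 3.5 (ubuntu:16.04), where `json.loads` rejects `bytes`; decoding after the `None` check also keeps `allow_missing=True` working when the key is absent. A standalone sketch of the behavior (hypothetical helper name):

```python
import json

def parse_s3_json(raw):
    # raw is None when the key is missing and allow_missing=True
    if raw is None:
        return None
    # Python 3.5's json.loads requires str, so decode the S3 bytes first
    return json.loads(raw.decode("utf-8"))

print(parse_s3_json(b'{"gpu": 1}'))  # {'gpu': 1}
print(parse_s3_json(None))           # None
```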