diff --git a/build/images.sh b/build/images.sh index 20b36d18c9..adf1caf142 100644 --- a/build/images.sh +++ b/build/images.sh @@ -64,6 +64,7 @@ dev_images_gcp=( non_dev_images_cluster=( "tensorflow-serving-cpu" "tensorflow-serving-gpu" + "cluster-autoscaler" "operator" "istio-proxy" "istio-pilot" @@ -72,7 +73,6 @@ non_dev_images_cluster=( non_dev_images_aws=( # includes non_dev_images_cluster "tensorflow-serving-inf" - "cluster-autoscaler" "metrics-server" "inferentia" "neuron-rtd" diff --git a/cli/cmd/cluster_gcp.go b/cli/cmd/cluster_gcp.go index 33c48215d4..652ce85a0a 100644 --- a/cli/cmd/cluster_gcp.go +++ b/cli/cmd/cluster_gcp.go @@ -432,6 +432,11 @@ func createGKECluster(clusterConfig *clusterconfig.GCPConfig, gcpClient *gcp.Cli gkeClusterParent := fmt.Sprintf("projects/%s/locations/%s", *clusterConfig.Project, *clusterConfig.Zone) gkeClusterName := fmt.Sprintf("%s/clusters/%s", gkeClusterParent, clusterConfig.ClusterName) + initialNodeCount := int64(1) + if *clusterConfig.MinInstances > 0 { + initialNodeCount = *clusterConfig.MinInstances + } + gkeClusterConfig := containerpb.Cluster{ Name: clusterConfig.ClusterName, InitialClusterVersion: "1.17", @@ -449,34 +454,56 @@ func createGKECluster(clusterConfig *clusterconfig.GCPConfig, gcpClient *gcp.Cli }, InitialNodeCount: 1, }, - { - Name: "ng-cortex-worker-on-demand", - Config: &containerpb.NodeConfig{ - MachineType: *clusterConfig.InstanceType, - Labels: nodeLabels, - Taints: []*containerpb.NodeTaint{ - { - Key: "workload", - Value: "true", - Effect: containerpb.NodeTaint_NO_SCHEDULE, - }, + }, + Locations: []string{*clusterConfig.Zone}, + } + + if clusterConfig.Preemptible { + gkeClusterConfig.NodePools = append(gkeClusterConfig.NodePools, &containerpb.NodePool{ + Name: "ng-cortex-wk-preemp", + Config: &containerpb.NodeConfig{ + MachineType: *clusterConfig.InstanceType, + Labels: nodeLabels, + Taints: []*containerpb.NodeTaint{ + { + Key: "workload", + Value: "true", + Effect: 
containerpb.NodeTaint_NO_SCHEDULE, }, - Accelerators: accelerators, - OauthScopes: []string{ - "https://www.googleapis.com/auth/compute", - "https://www.googleapis.com/auth/devstorage.read_only", + }, + Accelerators: accelerators, + OauthScopes: []string{ + "https://www.googleapis.com/auth/compute", + "https://www.googleapis.com/auth/devstorage.read_only", + }, + ServiceAccount: gcpClient.ClientEmail, + Preemptible: true, + }, + InitialNodeCount: int32(initialNodeCount), + }) + } + if clusterConfig.OnDemandBackup || !clusterConfig.Preemptible { + gkeClusterConfig.NodePools = append(gkeClusterConfig.NodePools, &containerpb.NodePool{ + Name: "ng-cortex-wk-on-dmd", + Config: &containerpb.NodeConfig{ + MachineType: *clusterConfig.InstanceType, + Labels: nodeLabels, + Taints: []*containerpb.NodeTaint{ + { + Key: "workload", + Value: "true", + Effect: containerpb.NodeTaint_NO_SCHEDULE, }, - ServiceAccount: gcpClient.ClientEmail, }, - Autoscaling: &containerpb.NodePoolAutoscaling{ - Enabled: true, - MinNodeCount: int32(*clusterConfig.MinInstances), - MaxNodeCount: int32(*clusterConfig.MaxInstances), + Accelerators: accelerators, + OauthScopes: []string{ + "https://www.googleapis.com/auth/compute", + "https://www.googleapis.com/auth/devstorage.read_only", }, - InitialNodeCount: int32(*clusterConfig.MinInstances), + ServiceAccount: gcpClient.ClientEmail, }, - }, - Locations: []string{*clusterConfig.Zone}, + InitialNodeCount: int32(initialNodeCount), + }) } if clusterConfig.Network != nil { diff --git a/docs/clusters/gcp/install.md b/docs/clusters/gcp/install.md index 6d654e3f9b..227ea56b70 100644 --- a/docs/clusters/gcp/install.md +++ b/docs/clusters/gcp/install.md @@ -37,6 +37,13 @@ min_instances: 1 # maximum number of instances max_instances: 5 +# enable the use of preemptible instances +preemptible: false + +# enable the use of on-demand backup instances which will be used when preemptible capacity runs out +# default is true when preemptible instances are used +# 
on_demand_backup: true + # GPU to attach to your instance (optional) # accelerator_type: nvidia-tesla-t4 diff --git a/manager/install.sh b/manager/install.sh index 396f8071fc..3a1b379cc7 100755 --- a/manager/install.sh +++ b/manager/install.sh @@ -113,6 +113,11 @@ function cluster_up_gcp() { kubectl apply -f /workspace/apis.yaml >/dev/null echo "✓" + echo -n "○ configuring autoscaling " + python render_template.py $CORTEX_CLUSTER_CONFIG_FILE manifests/cluster-autoscaler.yaml.j2 > /workspace/cluster-autoscaler.yaml + kubectl apply -f /workspace/cluster-autoscaler.yaml >/dev/null + echo "✓" + echo -n "○ configuring logging " python render_template.py $CORTEX_CLUSTER_CONFIG_FILE manifests/fluent-bit.yaml.j2 > /workspace/fluent-bit.yaml kubectl apply -f /workspace/fluent-bit.yaml >/dev/null diff --git a/manager/manifests/cluster-autoscaler.yaml.j2 b/manager/manifests/cluster-autoscaler.yaml.j2 index 9cb2e99df7..360c8f9176 100644 --- a/manager/manifests/cluster-autoscaler.yaml.j2 +++ b/manager/manifests/cluster-autoscaler.yaml.j2 @@ -131,7 +131,7 @@ subjects: name: cluster-autoscaler namespace: kube-system --- -{% if config.get('spot_config') is not none and config['spot_config'].get('on_demand_backup', false) %} +{% if (config.get('spot_config') and config['spot_config'].get('on_demand_backup', false)) or config.get('on_demand_backup') %} apiVersion: v1 kind: ConfigMap metadata: @@ -139,10 +139,17 @@ metadata: namespace: kube-system data: priorities: |- + {% if config.get('spot_config') %} 10: - .*ng-cortex-worker-on-demand.* 50: - .*ng-cortex-worker-spot.* + {% else %} + 10: + - .*ng-cortex-wk-on-dmd.* + 50: + - .*ng-cortex-wk-preemp.* + {% endif %} --- {% endif %} apiVersion: apps/v1 @@ -177,9 +184,13 @@ spec: - ./cluster-autoscaler - --v=4 - --stderrthreshold=info + {% if config["provider"] == "aws" %} - --cloud-provider=aws + {% else %} + - --cloud-provider=gce + {% endif %} - --skip-nodes-with-local-storage=false - {% if config.get('spot_config') is not none and 
config['spot_config'].get('on_demand_backup', false) %} + {% if (config.get('spot_config') and config['spot_config'].get('on_demand_backup', false)) or config.get('on_demand_backup') %} - --expander=priority {% else %} - --expander=least-waste @@ -189,7 +200,11 @@ spec: - --ok-total-unready-count=30 - --max-node-provision-time=5m - --scan-interval=20s + {% if config["provider"] == "aws" %} - --node-group-auto-discovery=asg:tag=k8s.io/cluster-autoscaler/enabled,k8s.io/cluster-autoscaler/{{ config['cluster_name'] }} + {% else %} + - --node-group-auto-discovery=mig:namePrefix=gke-{{ config['cluster_name'] }}-ng-cortex-wk,min={{ config["min_instances"] }},max={{ config["max_instances"] }} + {% endif %} volumeMounts: - name: ssl-certs mountPath: /etc/ssl/certs/ca-certificates.crt @@ -198,7 +213,11 @@ spec: volumes: - name: ssl-certs hostPath: + {% if config["provider"] == "aws" %} path: "/etc/ssl/certs/ca-bundle.crt" + {% else %} + path: "/etc/ssl/certs/ca-certificates.crt" + {% endif %} strategy: type: RollingUpdate rollingUpdate: diff --git a/pkg/types/clusterconfig/cluster_config_aws.go b/pkg/types/clusterconfig/cluster_config_aws.go index 40e937a73e..e9625dde32 100644 --- a/pkg/types/clusterconfig/cluster_config_aws.go +++ b/pkg/types/clusterconfig/cluster_config_aws.go @@ -1155,7 +1155,6 @@ func (cc *Config) UserTable() table.KeyValuePairs { items.Add(InstanceVolumeTypeUserKey, cc.InstanceVolumeType) items.Add(InstanceVolumeIOPSUserKey, cc.InstanceVolumeIOPS) items.Add(SpotUserKey, s.YesNo(*cc.Spot)) - if cc.Spot != nil && *cc.Spot { items.Add(InstanceDistributionUserKey, cc.SpotConfig.InstanceDistribution) items.Add(OnDemandBaseCapacityUserKey, *cc.SpotConfig.OnDemandBaseCapacity) diff --git a/pkg/types/clusterconfig/cluster_config_gcp.go b/pkg/types/clusterconfig/cluster_config_gcp.go index 62821cdd12..ae50526e9e 100644 --- a/pkg/types/clusterconfig/cluster_config_gcp.go +++ b/pkg/types/clusterconfig/cluster_config_gcp.go @@ -28,6 +28,7 @@ import ( 
"github.com/cortexlabs/cortex/pkg/lib/pointer" "github.com/cortexlabs/cortex/pkg/lib/prompt" "github.com/cortexlabs/cortex/pkg/lib/slices" + s "github.com/cortexlabs/cortex/pkg/lib/strings" "github.com/cortexlabs/cortex/pkg/lib/table" "github.com/cortexlabs/cortex/pkg/types" ) @@ -45,11 +46,14 @@ type GCPConfig struct { OperatorLoadBalancerScheme LoadBalancerScheme `json:"operator_load_balancer_scheme" yaml:"operator_load_balancer_scheme"` MinInstances *int64 `json:"min_instances" yaml:"min_instances"` MaxInstances *int64 `json:"max_instances" yaml:"max_instances"` + Preemptible bool `json:"preemptible" yaml:"preemptible"` + OnDemandBackup bool `json:"on_demand_backup" yaml:"on_demand_backup"` ClusterName string `json:"cluster_name" yaml:"cluster_name"` Telemetry bool `json:"telemetry" yaml:"telemetry"` ImageOperator string `json:"image_operator" yaml:"image_operator"` ImageManager string `json:"image_manager" yaml:"image_manager"` ImageDownloader string `json:"image_downloader" yaml:"image_downloader"` + ImageClusterAutoscaler string `json:"image_cluster_autoscaler" yaml:"image_cluster_autoscaler"` ImageFluentBit string `json:"image_fluent_bit" yaml:"image_fluent_bit"` ImageIstioProxy string `json:"image_istio_proxy" yaml:"image_istio_proxy"` ImageIstioPilot string `json:"image_istio_pilot" yaml:"image_istio_pilot"` @@ -206,6 +210,20 @@ var UserGCPValidation = &cr.StructValidation{ Validator: validateClusterName, }, }, + { + StructField: "Preemptible", + BoolValidation: &cr.BoolValidation{ + Default: false, + }, + }, + { + StructField: "OnDemandBackup", + DefaultDependentFields: []string{"Preemptible"}, + DefaultDependentFieldsFunc: func(vals []interface{}) interface{} { + return vals[0].(bool) + }, + BoolValidation: &cr.BoolValidation{}, + }, { StructField: "Project", StringPtrValidation: &cr.StringPtrValidation{}, @@ -235,6 +253,13 @@ var UserGCPValidation = &cr.StructValidation{ Validator: validateImageVersion, }, }, + { + StructField: "ImageClusterAutoscaler", 
+ StringValidation: &cr.StringValidation{ + Default: "quay.io/cortexlabs/cluster-autoscaler:" + consts.CortexVersion, + Validator: validateImageVersion, + }, + }, { StructField: "ImageFluentBit", StringValidation: &cr.StringValidation{ @@ -387,6 +412,10 @@ func (cc *GCPConfig) Validate(GCP *gcp.Client) error { } } + if !cc.Preemptible && cc.OnDemandBackup { + return ErrorFieldConfigurationDependentOnCondition(OnDemandBackupKey, s.Bool(cc.OnDemandBackup), PreemptibleKey, s.Bool(cc.Preemptible)) + } + return nil } @@ -490,6 +519,7 @@ func SetGCPDefaults(cc *GCPConfig) error { if errors.HasError(errs) { return errors.FirstError(errs...) } + return nil } @@ -542,6 +572,8 @@ func (cc *GCPConfig) UserTable() table.KeyValuePairs { if cc.AcceleratorsPerInstance != nil { items.Add(AcceleratorsPerInstanceUserKey, *cc.AcceleratorsPerInstance) } + items.Add(PreemptibleUserKey, s.YesNo(cc.Preemptible)) + items.Add(OnDemandBackupUserKey, s.YesNo(cc.OnDemandBackup)) if cc.Network != nil { items.Add(NetworkUserKey, *cc.Network) } @@ -554,6 +586,7 @@ func (cc *GCPConfig) UserTable() table.KeyValuePairs { items.Add(ImageOperatorUserKey, cc.ImageOperator) items.Add(ImageManagerUserKey, cc.ImageManager) items.Add(ImageDownloaderUserKey, cc.ImageDownloader) + items.Add(ImageClusterAutoscalerUserKey, cc.ImageClusterAutoscaler) items.Add(ImageFluentBitUserKey, cc.ImageFluentBit) items.Add(ImageIstioProxyUserKey, cc.ImageIstioProxy) items.Add(ImageIstioPilotUserKey, cc.ImageIstioPilot) @@ -602,6 +635,8 @@ func (cc *GCPConfig) TelemetryEvent() map[string]interface{} { if cc.ClusterName != "cortex" { event["cluster_name._is_custom"] = true } + event["preemptible"] = cc.Preemptible + event["on_demand_backup"] = cc.OnDemandBackup if cc.Zone != nil { event["zone._is_defined"] = true event["zone"] = *cc.Zone @@ -615,6 +650,9 @@ func (cc *GCPConfig) TelemetryEvent() map[string]interface{} { if !strings.HasPrefix(cc.ImageDownloader, "cortexlabs/") { event["image_downloader._is_custom"] = true } 
+ if !strings.HasPrefix(cc.ImageClusterAutoscaler, "cortexlabs/") { + event["image_cluster_autoscaler._is_custom"] = true + } if !strings.HasPrefix(cc.ImageFluentBit, "cortexlabs/") { event["image_fluent_bit._is_custom"] = true } diff --git a/pkg/types/clusterconfig/config_key.go b/pkg/types/clusterconfig/config_key.go index 72ee7f0624..17004347e1 100644 --- a/pkg/types/clusterconfig/config_key.go +++ b/pkg/types/clusterconfig/config_key.go @@ -31,6 +31,7 @@ const ( InstanceVolumeIOPSKey = "instance_volume_iops" SpotKey = "spot" SpotConfigKey = "spot_config" + PreemptibleKey = "preemptible" InstanceDistributionKey = "instance_distribution" OnDemandBaseCapacityKey = "on_demand_base_capacity" OnDemandPercentageAboveBaseCapacityKey = "on_demand_percentage_above_base_capacity" @@ -82,6 +83,7 @@ const ( SSLCertificateARNUserKey = "ssl certificate arn" BucketUserKey = "s3 bucket" SpotUserKey = "use spot instances" + PreemptibleUserKey = "use preemptible instances" InstanceTypeUserKey = "instance type" AcceleratorTypeUserKey = "accelerator type" AcceleratorsPerInstanceUserKey = "accelerators per instance" diff --git a/pkg/types/clusterconfig/errors.go b/pkg/types/clusterconfig/errors.go index 63db629382..6aac12e306 100644 --- a/pkg/types/clusterconfig/errors.go +++ b/pkg/types/clusterconfig/errors.go @@ -52,6 +52,7 @@ const ( ErrNoNATGatewayWithSubnets = "clusterconfig.no_nat_gateway_with_subnets" ErrSpecifyOneOrNone = "clusterconfig.specify_one_or_none" ErrDependentFieldMustBeSpecified = "clusterconfig.dependent_field_must_be_specified" + ErrFieldConfigurationDependentOnCondition = "clusterconfig.field_configuration_dependent_on_condition" ErrDidNotMatchStrictS3Regex = "clusterconfig.did_not_match_strict_s3_regex" ErrNATRequiredWithPrivateSubnetVisibility = "clusterconfig.nat_required_with_private_subnet_visibility" ErrS3RegionDiffersFromCluster = "clusterconfig.s3_region_differs_from_cluster" @@ -249,6 +250,13 @@ func ErrorDependentFieldMustBeSpecified(configuredField 
string, dependencyField }) } +func ErrorFieldConfigurationDependentOnCondition(configuredField string, configuredFieldValue string, dependencyField string, dependencyFieldValue string) error { + return errors.WithStack(&errors.Error{ + Kind: ErrFieldConfigurationDependentOnCondition, + Message: fmt.Sprintf("cannot set %s = %s when %s = %s", configuredField, configuredFieldValue, dependencyField, dependencyFieldValue), + }) +} + func ErrorDidNotMatchStrictS3Regex() error { return errors.WithStack(&errors.Error{ Kind: ErrDidNotMatchStrictS3Regex,