From 8244c4e988aec065079acff63ef8103e9375a931 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Sat, 16 Jan 2021 00:35:41 +0200 Subject: [PATCH 01/11] WIP on GCP preemptibles --- cli/cmd/cluster_gcp.go | 92 +++++++++---- manager/install.sh | 5 + manager/manifests/cluster-autoscaler.yaml.j2 | 25 +++- pkg/types/clusterconfig/cluster_config_aws.go | 5 +- pkg/types/clusterconfig/cluster_config_gcp.go | 124 +++++++++++++++--- pkg/types/clusterconfig/config_key.go | 3 + pkg/types/clusterconfig/errors.go | 73 ++++++----- 7 files changed, 250 insertions(+), 77 deletions(-) diff --git a/cli/cmd/cluster_gcp.go b/cli/cmd/cluster_gcp.go index 93e0ad632b..a07df18209 100644 --- a/cli/cmd/cluster_gcp.go +++ b/cli/cmd/cluster_gcp.go @@ -432,6 +432,11 @@ func createGKECluster(clusterConfig *clusterconfig.GCPConfig, gcpClient *gcp.Cli gkeClusterParent := fmt.Sprintf("projects/%s/locations/%s", *clusterConfig.Project, *clusterConfig.Zone) gkeClusterName := fmt.Sprintf("%s/clusters/%s", gkeClusterParent, clusterConfig.ClusterName) + initialNodeCount := int64(1) + if *clusterConfig.MinInstances > 0 { + initialNodeCount = *clusterConfig.MinInstances + } + gkeClusterConfig := containerpb.Cluster{ Name: clusterConfig.ClusterName, InitialClusterVersion: "1.17", @@ -449,34 +454,75 @@ func createGKECluster(clusterConfig *clusterconfig.GCPConfig, gcpClient *gcp.Cli }, InitialNodeCount: 1, }, - { - Name: "ng-cortex-worker-on-demand", - Config: &containerpb.NodeConfig{ - MachineType: *clusterConfig.InstanceType, - Labels: nodeLabels, - Taints: []*containerpb.NodeTaint{ - { - Key: "workload", - Value: "true", - Effect: containerpb.NodeTaint_NO_SCHEDULE, - }, + }, + Locations: []string{*clusterConfig.Zone}, + } + + onDemandInitialNodeCount := initialNodeCount + onDemandRequired := false + if clusterConfig.Preemptible == nil { + onDemandRequired = true + } + + if clusterConfig.Preemptible != nil && *clusterConfig.Preemptible && clusterConfig.PreemptibleConfig != nil { + if clusterConfig.Preemptible != nil && *clusterConfig.Preemptible && clusterConfig.PreemptibleConfig != nil { + if clusterConfig.PreemptibleConfig.OnDemandBackup != nil && *clusterConfig.PreemptibleConfig.OnDemandBackup { + onDemandRequired = true + } + if clusterConfig.PreemptibleConfig.OnDemandBaseCapacity != nil && *clusterConfig.PreemptibleConfig.OnDemandBaseCapacity > 0 { + onDemandInitialNodeCount = *clusterConfig.PreemptibleConfig.OnDemandBaseCapacity + } + } + } + if onDemandRequired || onDemandInitialNodeCount > 0 { + gkeClusterConfig.NodePools = append(gkeClusterConfig.NodePools, &containerpb.NodePool{ + Name: "ng-cortex-wk-on-dmd", + Config: &containerpb.NodeConfig{ + MachineType: *clusterConfig.InstanceType, + Labels: nodeLabels, + Taints: []*containerpb.NodeTaint{ + { + Key: "workload", + Value: "true", + Effect: containerpb.NodeTaint_NO_SCHEDULE, }, - Accelerators: accelerators, - OauthScopes: []string{ - "https://www.googleapis.com/auth/compute", - "https://www.googleapis.com/auth/devstorage.read_only", + }, + Accelerators: accelerators, + OauthScopes: []string{ + "https://www.googleapis.com/auth/compute", + "https://www.googleapis.com/auth/devstorage.read_only", + }, + }, + InitialNodeCount: int32(onDemandInitialNodeCount), + }) + } + + if onDemandInitialNodeCount > 0 && initialNodeCount-onDemandInitialNodeCount > 0 { + initialNodeCount -= onDemandInitialNodeCount + } + + if clusterConfig.Preemptible != nil && *clusterConfig.Preemptible { + gkeClusterConfig.NodePools = append(gkeClusterConfig.NodePools, &containerpb.NodePool{ + Name: "ng-cortex-wk-preemp", + Config: &containerpb.NodeConfig{ + MachineType: *clusterConfig.InstanceType, + Labels: nodeLabels, + Taints: []*containerpb.NodeTaint{ + { + Key: "workload", + Value: "true", + Effect: containerpb.NodeTaint_NO_SCHEDULE, }, - ServiceAccount: gcpClient.ClientEmail, }, - Autoscaling: &containerpb.NodePoolAutoscaling{ - Enabled: true, - MinNodeCount: int32(*clusterConfig.MinInstances), - MaxNodeCount: int32(*clusterConfig.MaxInstances), + Accelerators: accelerators, + OauthScopes: []string{ + "https://www.googleapis.com/auth/compute", + "https://www.googleapis.com/auth/devstorage.read_only", }, - InitialNodeCount: int32(*clusterConfig.MinInstances), + ServiceAccount: gcpClient.ClientEmail, }, - }, - Locations: []string{*clusterConfig.Zone}, + InitialNodeCount: int32(initialNodeCount), + }) } if clusterConfig.Network != nil { diff --git a/manager/install.sh b/manager/install.sh index e1629a3d0e..eb30916c57 100755 --- a/manager/install.sh +++ b/manager/install.sh @@ -113,6 +113,11 @@ function cluster_up_gcp() { kubectl apply -f /workspace/apis.yaml >/dev/null echo "✓" + echo -n "○ configuring autoscaling " + python render_template.py $CORTEX_CLUSTER_CONFIG_FILE manifests/cluster-autoscaler.yaml.j2 > /workspace/cluster-autoscaler.yaml + kubectl apply -f /workspace/cluster-autoscaler.yaml >/dev/null + echo "✓" + echo -n "○ configuring logging " python render_template.py $CORTEX_CLUSTER_CONFIG_FILE manifests/fluent-bit.yaml.j2 > /workspace/fluent-bit.yaml kubectl apply -f /workspace/fluent-bit.yaml >/dev/null diff --git a/manager/manifests/cluster-autoscaler.yaml.j2 b/manager/manifests/cluster-autoscaler.yaml.j2 index 9cb2e99df7..b69b85af5d 100644 --- a/manager/manifests/cluster-autoscaler.yaml.j2 +++ b/manager/manifests/cluster-autoscaler.yaml.j2 @@ -131,7 +131,7 @@ subjects: name: cluster-autoscaler namespace: kube-system --- -{% if config.get('spot_config') is not none and config['spot_config'].get('on_demand_backup', false) %} +{% if (config.get('spot_config') and config['spot_config'].get('on_demand_backup', false)) or (config.get('preemptible_config') and config['preemptible_config'].get('on_demand_backup', false)) %} apiVersion: v1 kind: ConfigMap metadata: @@ -139,10 +139,17 @@ metadata: namespace: kube-system data: priorities: |- + {% if config.get('spot_config') %} 10: - .*ng-cortex-worker-on-demand.* 50: - .*ng-cortex-worker-spot.* + {% elif config.get('preemptible_config') %} + 10: + - .*ng-cortex-wk-on-dmd.* + 50: + - .*ng-cortex-wk-preemp.* + {% endif %} --- {% endif %} apiVersion: apps/v1 @@ -177,9 +184,15 @@ spec: - ./cluster-autoscaler - --v=4 - --stderrthreshold=info + - --scale-down-delay-after-add=10s + - --scale-down-unneeded-time=20s + {% if config["provider"] == "aws" %} - --cloud-provider=aws + {% else %} + - --cloud-provider=gce + {% endif %} - --skip-nodes-with-local-storage=false - {% if config.get('spot_config') is not none and config['spot_config'].get('on_demand_backup', false) %} + {% if (config.get('spot_config') and config['spot_config'].get('on_demand_backup', false)) or (config.get('preemptible_config') and config['preemptible_config'].get('on_demand_backup', false)) %} - --expander=priority {% else %} - --expander=least-waste @@ -189,7 +202,11 @@ spec: - --ok-total-unready-count=30 - --max-node-provision-time=5m - --scan-interval=20s + {% if config["provider"] == "aws" %} - --node-group-auto-discovery=asg:tag=k8s.io/cluster-autoscaler/enabled,k8s.io/cluster-autoscaler/{{ config['cluster_name'] }} + {% else %} + - --node-group-auto-discovery=mig:namePrefix=gke-{{ config['cluster_name'] }}-ng-cortex-wk,min={{ config["min_instances"] }},max={{ config["max_instances"] }} + {% endif %} volumeMounts: - name: ssl-certs mountPath: /etc/ssl/certs/ca-certificates.crt @@ -198,7 +215,11 @@ spec: volumes: - name: ssl-certs hostPath: + {% if config["provider"] == "aws" %} path: "/etc/ssl/certs/ca-bundle.crt" + {% else %} + path: "/etc/ssl/certs/ca-certificates.crt" + {% endif %} strategy: type: RollingUpdate rollingUpdate: diff --git a/pkg/types/clusterconfig/cluster_config_aws.go b/pkg/types/clusterconfig/cluster_config_aws.go index 40e937a73e..b3ce6e8a22 100644 --- a/pkg/types/clusterconfig/cluster_config_aws.go +++ b/pkg/types/clusterconfig/cluster_config_aws.go @@ -1154,8 +1154,9 @@ func (cc *Config) UserTable() table.KeyValuePairs { items.Add(InstanceVolumeSizeUserKey, cc.InstanceVolumeSize) items.Add(InstanceVolumeTypeUserKey, cc.InstanceVolumeType) items.Add(InstanceVolumeIOPSUserKey, cc.InstanceVolumeIOPS) - items.Add(SpotUserKey, s.YesNo(*cc.Spot)) - + if cc.Spot != nil { + items.Add(SpotUserKey, s.YesNo(*cc.Spot)) + } if cc.Spot != nil && *cc.Spot { items.Add(InstanceDistributionUserKey, cc.SpotConfig.InstanceDistribution) items.Add(OnDemandBaseCapacityUserKey, *cc.SpotConfig.OnDemandBaseCapacity) diff --git a/pkg/types/clusterconfig/cluster_config_gcp.go b/pkg/types/clusterconfig/cluster_config_gcp.go index 63cbb26a27..67a10288d9 100644 --- a/pkg/types/clusterconfig/cluster_config_gcp.go +++ b/pkg/types/clusterconfig/cluster_config_gcp.go @@ -28,29 +28,38 @@ import ( "github.com/cortexlabs/cortex/pkg/lib/pointer" "github.com/cortexlabs/cortex/pkg/lib/prompt" "github.com/cortexlabs/cortex/pkg/lib/slices" + s "github.com/cortexlabs/cortex/pkg/lib/strings" "github.com/cortexlabs/cortex/pkg/lib/table" "github.com/cortexlabs/cortex/pkg/types" ) type GCPConfig struct { - Provider types.ProviderType `json:"provider" yaml:"provider"` - Project *string `json:"project" yaml:"project"` - Zone *string `json:"zone" yaml:"zone"` - InstanceType *string `json:"instance_type" yaml:"instance_type"` - AcceleratorType *string `json:"accelerator_type" yaml:"accelerator_type"` - Network *string `json:"network" yaml:"network"` - Subnet *string `json:"subnet" yaml:"subnet"` - MinInstances *int64 `json:"min_instances" yaml:"min_instances"` - MaxInstances *int64 `json:"max_instances" yaml:"max_instances"` - ClusterName string `json:"cluster_name" yaml:"cluster_name"` - Telemetry bool `json:"telemetry" yaml:"telemetry"` - ImageOperator string `json:"image_operator" yaml:"image_operator"` - ImageManager string `json:"image_manager" yaml:"image_manager"` - ImageDownloader string `json:"image_downloader" yaml:"image_downloader"` - ImageFluentBit string `json:"image_fluent_bit" yaml:"image_fluent_bit"` - ImageIstioProxy string `json:"image_istio_proxy" yaml:"image_istio_proxy"` - ImageIstioPilot string `json:"image_istio_pilot" yaml:"image_istio_pilot"` - ImageGooglePause string `json:"image_google_pause" yaml:"image_google_pause"` + Provider types.ProviderType `json:"provider" yaml:"provider"` + Project *string `json:"project" yaml:"project"` + Zone *string `json:"zone" yaml:"zone"` + InstanceType *string `json:"instance_type" yaml:"instance_type"` + AcceleratorType *string `json:"accelerator_type" yaml:"accelerator_type"` + Network *string `json:"network" yaml:"network"` + Subnet *string `json:"subnet" yaml:"subnet"` + MinInstances *int64 `json:"min_instances" yaml:"min_instances"` + MaxInstances *int64 `json:"max_instances" yaml:"max_instances"` + Preemptible *bool `json:"preemptible" yaml:"preemptible"` + PreemptibleConfig *PreemptibleConfig `json:"preemptible_config" yaml:"preemptible_config"` + ClusterName string `json:"cluster_name" yaml:"cluster_name"` + Telemetry bool `json:"telemetry" yaml:"telemetry"` + ImageOperator string `json:"image_operator" yaml:"image_operator"` + ImageManager string `json:"image_manager" yaml:"image_manager"` + ImageDownloader string `json:"image_downloader" yaml:"image_downloader"` + ImageClusterAutoscaler string `json:"image_cluster_autoscaler" yaml:"image_cluster_autoscaler"` + ImageFluentBit string `json:"image_fluent_bit" yaml:"image_fluent_bit"` + ImageIstioProxy string `json:"image_istio_proxy" yaml:"image_istio_proxy"` + ImageIstioPilot string `json:"image_istio_pilot" yaml:"image_istio_pilot"` + ImageGooglePause string `json:"image_google_pause" yaml:"image_google_pause"` +} + +type PreemptibleConfig struct { + OnDemandBaseCapacity *int64 `json:"on_demand_base_capacity" yaml:"on_demand_base_capacity"` + OnDemandBackup *bool `json:"on_demand_backup" yaml:"on_demand_backup"` } type InternalGCPConfig struct { @@ -169,6 +178,34 @@ var UserGCPValidation = &cr.StructValidation{ Validator: validateClusterName, }, }, + { + StructField: "Preemptible", + BoolPtrValidation: &cr.BoolPtrValidation{ + Default: pointer.Bool(false), + }, + }, + { + StructField: "PreemptibleConfig", + StructValidation: &cr.StructValidation{ + DefaultNil: true, + AllowExplicitNull: true, + StructFieldValidations: []*cr.StructFieldValidation{ + { + StructField: "OnDemandBaseCapacity", + Int64PtrValidation: &cr.Int64PtrValidation{ + GreaterThanOrEqualTo: pointer.Int64(0), + AllowExplicitNull: true, + }, + }, + { + StructField: "OnDemandBackup", + BoolPtrValidation: &cr.BoolPtrValidation{ + Default: pointer.Bool(true), + }, + }, + }, + }, + }, { StructField: "Project", StringPtrValidation: &cr.StringPtrValidation{}, @@ -198,6 +235,13 @@ var UserGCPValidation = &cr.StructValidation{ Validator: validateImageVersion, }, }, + { + StructField: "ImageClusterAutoscaler", + StringValidation: &cr.StringValidation{ + Default: "quay.io/cortexlabs/cluster-austoscaler:" + consts.CortexVersion, + Validator: validateImageVersion, + }, + }, { StructField: "ImageFluentBit", StringValidation: &cr.StringValidation{ @@ -343,6 +387,26 @@ func (cc *GCPConfig) Validate(GCP *gcp.Client) error { } } + if cc.Preemptible != nil && *cc.Preemptible { + if cc.PreemptibleConfig == nil { + cc.PreemptibleConfig = &PreemptibleConfig{} + } + if cc.PreemptibleConfig.OnDemandBaseCapacity == nil { + cc.PreemptibleConfig.OnDemandBaseCapacity = pointer.Int64(0) + } + if cc.PreemptibleConfig.OnDemandBackup == nil { + cc.PreemptibleConfig.OnDemandBackup = pointer.Bool(true) + } + if *cc.PreemptibleConfig.OnDemandBaseCapacity > *cc.MaxInstances { + return ErrorOnDemandBaseCapacityGreaterThanMax(**&cc.PreemptibleConfig.OnDemandBaseCapacity, *cc.MaxInstances) + } + + } else { + if cc.PreemptibleConfig != nil { + return ErrorGCPConfiguredWhenPreemptibleIsNotEnabled(PreemptibleConfigKey) + } + } + return nil } @@ -495,6 +559,13 @@ func (cc *GCPConfig) UserTable() table.KeyValuePairs { if cc.AcceleratorType != nil { items.Add(AcceleratorTypeUserKey, *cc.AcceleratorType) } + if cc.Preemptible != nil { + items.Add(PreemptibleUserKey, s.YesNo(*cc.Preemptible)) + } + if cc.Preemptible != nil && *cc.Preemptible { + items.Add(OnDemandBaseCapacityUserKey, *cc.PreemptibleConfig.OnDemandBaseCapacity) + items.Add(OnDemandBackupUserKey, s.YesNo(*cc.PreemptibleConfig.OnDemandBackup)) + } if cc.Network != nil { items.Add(NetworkUserKey, *cc.Network) } @@ -505,6 +576,7 @@ func (cc *GCPConfig) UserTable() table.KeyValuePairs { items.Add(ImageOperatorUserKey, cc.ImageOperator) items.Add(ImageManagerUserKey, cc.ImageManager) items.Add(ImageDownloaderUserKey, cc.ImageDownloader) + items.Add(ImageClusterAutoscalerUserKey, cc.ImageClusterAutoscaler) items.Add(ImageFluentBitUserKey, cc.ImageFluentBit) items.Add(ImageIstioProxyUserKey, cc.ImageIstioProxy) items.Add(ImageIstioPilotUserKey, cc.ImageIstioPilot) @@ -547,6 +619,19 @@ func (cc *GCPConfig) TelemetryEvent() map[string]interface{} { if cc.ClusterName != "cortex" { event["cluster_name._is_custom"] = true } + if cc.Preemptible != nil { + event["preemptible._is_defined"] = true + event["preemptible"] = *cc.Preemptible + } + if cc.PreemptibleConfig != nil { + event["preemptible_config._is_defined"] = true + if cc.PreemptibleConfig.OnDemandBaseCapacity != nil { + event["preemptible_config.on_demand_base_capacity"] = *cc.PreemptibleConfig.OnDemandBaseCapacity + } + if cc.PreemptibleConfig.OnDemandBackup != nil { + event["preemptible_config.on_demand_backup"] = *cc.PreemptibleConfig.OnDemandBackup + } + } if cc.Zone != nil { event["zone._is_defined"] = true event["zone"] = *cc.Zone @@ -560,6 +645,9 @@ func (cc *GCPConfig) TelemetryEvent() map[string]interface{} { if !strings.HasPrefix(cc.ImageDownloader, "cortexlabs/") { event["image_downloader._is_custom"] = true } + if !strings.HasPrefix(cc.ImageClusterAutoscaler, "cortexlabs/") { + event["image_cluster_autoscaler._is_custom"] = true + } if !strings.HasPrefix(cc.ImageFluentBit, "cortexlabs/") { event["image_fluent_bit._is_custom"] = true } diff --git a/pkg/types/clusterconfig/config_key.go b/pkg/types/clusterconfig/config_key.go index 4cf8f5de74..324d8e6444 100644 --- a/pkg/types/clusterconfig/config_key.go +++ b/pkg/types/clusterconfig/config_key.go @@ -30,6 +30,8 @@ const ( InstanceVolumeIOPSKey = "instance_volume_iops" SpotKey = "spot" SpotConfigKey = "spot_config" + PreemptibleKey = "preemptible" + PreemptibleConfigKey = "preemptible_config" InstanceDistributionKey = "instance_distribution" OnDemandBaseCapacityKey = "on_demand_base_capacity" OnDemandPercentageAboveBaseCapacityKey = "on_demand_percentage_above_base_capacity" @@ -81,6 +83,7 @@ const ( SSLCertificateARNUserKey = "ssl certificate arn" BucketUserKey = "s3 bucket" SpotUserKey = "use spot instances" + PreemptibleUserKey = "user preemptible instances" InstanceTypeUserKey = "instance type" AcceleratorTypeUserKey = "accelerator type" NetworkUserKey = "network" diff --git a/pkg/types/clusterconfig/errors.go b/pkg/types/clusterconfig/errors.go index 13887254f6..10c9fabbf3 100644 --- a/pkg/types/clusterconfig/errors.go +++ b/pkg/types/clusterconfig/errors.go @@ -28,44 +28,46 @@ import ( ) const ( - ErrInvalidRegion = "clusterconfig.invalid_region" - ErrInstanceTypeTooSmall = "clusterconfig.instance_type_too_small" - ErrMinInstancesGreaterThanMax = "clusterconfig.min_instances_greater_than_max" - ErrInstanceTypeNotSupportedInRegion = "clusterconfig.instance_type_not_supported_in_region" - ErrIncompatibleSpotInstanceTypeMemory = "clusterconfig.incompatible_spot_instance_type_memory" - ErrIncompatibleSpotInstanceTypeCPU = "clusterconfig.incompatible_spot_instance_type_cpu" - ErrIncompatibleSpotInstanceTypeGPU = "clusterconfig.incompatible_spot_instance_type_gpu" - ErrIncompatibleSpotInstanceTypeInf = "clusterconfig.incompatible_spot_instance_type_inf" - ErrSpotPriceGreaterThanTargetOnDemand = "clusterconfig.spot_price_greater_than_target_on_demand" - ErrSpotPriceGreaterThanMaxPrice = "clusterconfig.spot_price_greater_than_max_price" - ErrInstanceTypeNotSupported = "clusterconfig.instance_type_not_supported" - ErrARMInstancesNotSupported = "clusterconfig.arm_instances_not_supported" - ErrAtLeastOneInstanceDistribution = "clusterconfig.at_least_one_instance_distribution" - ErrNoCompatibleSpotInstanceFound = "clusterconfig.no_compatible_spot_instance_found" - ErrConfiguredWhenSpotIsNotEnabled = "clusterconfig.configured_when_spot_is_not_enabled" - ErrOnDemandBaseCapacityGreaterThanMax = "clusterconfig.on_demand_base_capacity_greater_than_max" - ErrConfigCannotBeChangedOnUpdate = "clusterconfig.config_cannot_be_changed_on_update" - ErrInvalidAvailabilityZone = "clusterconfig.invalid_availability_zone" - ErrAvailabilityZoneSpecifiedTwice = "clusterconfig.availability_zone_specified_twice" - ErrUnsupportedAvailabilityZone = "clusterconfig.unsupported_availability_zone" - ErrNotEnoughValidDefaultAvailibilityZones = "clusterconfig.not_enough_valid_default_availability_zones" - ErrNoNATGatewayWithSubnets = "clusterconfig.no_nat_gateway_with_subnets" - ErrSpecifyOneOrNone = "clusterconfig.specify_one_or_none" - ErrDidNotMatchStrictS3Regex = "clusterconfig.did_not_match_strict_s3_regex" - ErrNATRequiredWithPrivateSubnetVisibility = "clusterconfig.nat_required_with_private_subnet_visibility" - ErrS3RegionDiffersFromCluster = "clusterconfig.s3_region_differs_from_cluster" - ErrInvalidInstanceType = "clusterconfig.invalid_instance_type" - ErrIOPSNotSupported = "clusterconfig.iops_not_supported" - ErrIOPSTooLarge = "clusterconfig.iops_too_large" - ErrCantOverrideDefaultTag = "clusterconfig.cant_override_default_tag" - ErrSSLCertificateARNNotFound = "clusterconfig.ssl_certificate_arn_not_found" - ErrProviderMismatch = "clusterconfig.provider_mismatch" + ErrInvalidRegion = "clusterconfig.invalid_region" + ErrInstanceTypeTooSmall = "clusterconfig.instance_type_too_small" + ErrMinInstancesGreaterThanMax = "clusterconfig.min_instances_greater_than_max" + ErrInstanceTypeNotSupportedInRegion = "clusterconfig.instance_type_not_supported_in_region" + ErrIncompatibleSpotInstanceTypeMemory = "clusterconfig.incompatible_spot_instance_type_memory" + ErrIncompatibleSpotInstanceTypeCPU = "clusterconfig.incompatible_spot_instance_type_cpu" + ErrIncompatibleSpotInstanceTypeGPU = "clusterconfig.incompatible_spot_instance_type_gpu" + ErrIncompatibleSpotInstanceTypeInf = "clusterconfig.incompatible_spot_instance_type_inf" + ErrSpotPriceGreaterThanTargetOnDemand = "clusterconfig.spot_price_greater_than_target_on_demand" + ErrSpotPriceGreaterThanMaxPrice = "clusterconfig.spot_price_greater_than_max_price" + ErrInstanceTypeNotSupported = "clusterconfig.instance_type_not_supported" + ErrARMInstancesNotSupported = "clusterconfig.arm_instances_not_supported" + ErrAtLeastOneInstanceDistribution = "clusterconfig.at_least_one_instance_distribution" + ErrNoCompatibleSpotInstanceFound = "clusterconfig.no_compatible_spot_instance_found" + ErrConfiguredWhenSpotIsNotEnabled = "clusterconfig.configured_when_spot_is_not_enabled" + ErrOnDemandBaseCapacityGreaterThanMax = "clusterconfig.on_demand_base_capacity_greater_than_max" + ErrConfigCannotBeChangedOnUpdate = "clusterconfig.config_cannot_be_changed_on_update" + ErrInvalidAvailabilityZone = "clusterconfig.invalid_availability_zone" + ErrAvailabilityZoneSpecifiedTwice = "clusterconfig.availability_zone_specified_twice" + ErrUnsupportedAvailabilityZone = "clusterconfig.unsupported_availability_zone" + ErrNotEnoughValidDefaultAvailibilityZones = "clusterconfig.not_enough_valid_default_availability_zones" + ErrNoNATGatewayWithSubnets = "clusterconfig.no_nat_gateway_with_subnets" + ErrSpecifyOneOrNone = "clusterconfig.specify_one_or_none" + ErrDidNotMatchStrictS3Regex = "clusterconfig.did_not_match_strict_s3_regex" + ErrNATRequiredWithPrivateSubnetVisibility = "clusterconfig.nat_required_with_private_subnet_visibility" + ErrS3RegionDiffersFromCluster = "clusterconfig.s3_region_differs_from_cluster" + ErrInvalidInstanceType = "clusterconfig.invalid_instance_type" + ErrIOPSNotSupported = "clusterconfig.iops_not_supported" + ErrIOPSTooLarge = "clusterconfig.iops_too_large" + ErrCantOverrideDefaultTag = "clusterconfig.cant_override_default_tag" + ErrSSLCertificateARNNotFound = "clusterconfig.ssl_certificate_arn_not_found" + ErrProviderMismatch = "clusterconfig.provider_mismatch" + ErrGCPInvalidProjectID = "clusterconfig.gcp_invalid_project_id" ErrGCPProjectMustBeSpecified = "clusterconfig.gcp_project_must_be_specified" ErrGCPInvalidZone = "clusterconfig.gcp_invalid_zone" ErrGCPInvalidInstanceType = "clusterconfig.gcp_invalid_instance_type" ErrGCPInvalidAcceleratorType = "clusterconfig.gcp_invalid_accelerator_type" ErrGCPIncompatibleInstanceTypeWithAccelerator = "clusterconfig.gcp_incompatible_instance_type_with_accelerator" + ErrGCPConfiguredWhenPreemptibleIsNotEnabled = "clusterconfig.gcp_configured_when_preemptible_is_not_enabled" ) func ErrorInvalidRegion(region string) error { @@ -366,3 +368,10 @@ func ErrorGCPIncompatibleInstanceTypeWithAccelerator(instanceType, acceleratorTy Message: fmt.Sprintf("instance type %s is incompatible with the %s accelerator; the following instance types are compatible with the %s accelerator in zone %s: %s", instanceType, acceleratorType, acceleratorType, zone, s.StrsOr(compatibleInstances)), }) } + +func ErrorGCPConfiguredWhenPreemptibleIsNotEnabled(configKey string) error { + return errors.WithStack(&errors.Error{ + Kind: ErrGCPConfiguredWhenPreemptibleIsNotEnabled, + Message: fmt.Sprintf("%s cannot be specified unless preemptible is enabled (to enable preemptible instances, set `%s: true` in your cluster configuration file)", configKey, PreemptibleKey), + }) +} From e53a74980b211c0b367ed67defb70a6579a810f0 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Sat, 16 Jan 2021 01:52:53 +0200 Subject: [PATCH 02/11] WIP on GCP preemptibles --- cli/cmd/cluster_gcp.go | 34 ++------ manager/manifests/cluster-autoscaler.yaml.j2 | 6 +- pkg/types/clusterconfig/cluster_config_gcp.go | 79 +++---------------- 3 files changed, 23 insertions(+), 96 deletions(-) diff --git a/cli/cmd/cluster_gcp.go b/cli/cmd/cluster_gcp.go index a07df18209..7370086bc5 100644 --- a/cli/cmd/cluster_gcp.go +++ b/cli/cmd/cluster_gcp.go @@ -458,25 +458,9 @@ func createGKECluster(clusterConfig *clusterconfig.GCPConfig, gcpClient *gcp.Cli Locations: []string{*clusterConfig.Zone}, } - onDemandInitialNodeCount := initialNodeCount - onDemandRequired := false - if clusterConfig.Preemptible == nil { - onDemandRequired = true - } - - if clusterConfig.Preemptible != nil && *clusterConfig.Preemptible && clusterConfig.PreemptibleConfig != nil { - if clusterConfig.Preemptible != nil && *clusterConfig.Preemptible && clusterConfig.PreemptibleConfig != nil { - if clusterConfig.PreemptibleConfig.OnDemandBackup != nil && *clusterConfig.PreemptibleConfig.OnDemandBackup { - onDemandRequired = true - } - if clusterConfig.PreemptibleConfig.OnDemandBaseCapacity != nil && *clusterConfig.PreemptibleConfig.OnDemandBaseCapacity > 0 { - onDemandInitialNodeCount = *clusterConfig.PreemptibleConfig.OnDemandBaseCapacity - } - } - } - if onDemandRequired || onDemandInitialNodeCount > 0 { + if clusterConfig.Preemptible { gkeClusterConfig.NodePools = append(gkeClusterConfig.NodePools, &containerpb.NodePool{ - Name: "ng-cortex-wk-on-dmd", + Name: "ng-cortex-wk-preemp", Config: &containerpb.NodeConfig{ MachineType: *clusterConfig.InstanceType, Labels: nodeLabels, @@ -492,18 +476,15 @@ func createGKECluster(clusterConfig *clusterconfig.GCPConfig, gcpClient *gcp.Cli "https://www.googleapis.com/auth/compute", "https://www.googleapis.com/auth/devstorage.read_only", }, + ServiceAccount: gcpClient.ClientEmail, + Preemptible: true, }, - InitialNodeCount: int32(onDemandInitialNodeCount), + InitialNodeCount: int32(initialNodeCount), }) } - - if onDemandInitialNodeCount > 0 && initialNodeCount-onDemandInitialNodeCount > 0 { - initialNodeCount -= onDemandInitialNodeCount - } - - if clusterConfig.Preemptible != nil && *clusterConfig.Preemptible { + if clusterConfig.OnDemandBackup || !clusterConfig.Preemptible { gkeClusterConfig.NodePools = append(gkeClusterConfig.NodePools, &containerpb.NodePool{ - Name: "ng-cortex-wk-preemp", + Name: "ng-cortex-wk-on-dmd", Config: &containerpb.NodeConfig{ MachineType: *clusterConfig.InstanceType, Labels: nodeLabels, @@ -519,7 +500,6 @@ func createGKECluster(clusterConfig *clusterconfig.GCPConfig, gcpClient *gcp.Cli "https://www.googleapis.com/auth/compute", "https://www.googleapis.com/auth/devstorage.read_only", }, - ServiceAccount: gcpClient.ClientEmail, }, InitialNodeCount: int32(initialNodeCount), }) diff --git a/manager/manifests/cluster-autoscaler.yaml.j2 b/manager/manifests/cluster-autoscaler.yaml.j2 index b69b85af5d..1d2d7ea4f4 100644 --- a/manager/manifests/cluster-autoscaler.yaml.j2 +++ b/manager/manifests/cluster-autoscaler.yaml.j2 @@ -131,7 +131,7 @@ subjects: name: cluster-autoscaler namespace: kube-system --- -{% if (config.get('spot_config') and config['spot_config'].get('on_demand_backup', false)) or (config.get('preemptible_config') and config['preemptible_config'].get('on_demand_backup', false)) %} +{% if (config.get('spot_config') and config['spot_config'].get('on_demand_backup', false)) or config.get('on_demand_backup') %} apiVersion: v1 kind: ConfigMap metadata: @@ -144,7 +144,7 @@ data: - .*ng-cortex-worker-on-demand.* 50: - .*ng-cortex-worker-spot.* - {% elif config.get('preemptible_config') %} + {% else %} 10: - .*ng-cortex-wk-on-dmd.* 50: @@ -192,7 +192,7 @@ spec: - --cloud-provider=gce {% endif %} - --skip-nodes-with-local-storage=false - {% if (config.get('spot_config') and config['spot_config'].get('on_demand_backup', false)) or (config.get('preemptible_config') and config['preemptible_config'].get('on_demand_backup', false)) %} + {% if (config.get('spot_config') and config['spot_config'].get('on_demand_backup', false)) or config.get('on_demand_backup') %} - --expander=priority {% else %} - --expander=least-waste diff --git a/pkg/types/clusterconfig/cluster_config_gcp.go b/pkg/types/clusterconfig/cluster_config_gcp.go index 67a10288d9..c7074705f7 100644 --- a/pkg/types/clusterconfig/cluster_config_gcp.go +++ b/pkg/types/clusterconfig/cluster_config_gcp.go @@ -43,8 +43,8 @@ type GCPConfig struct { Subnet *string `json:"subnet" yaml:"subnet"` MinInstances *int64 `json:"min_instances" yaml:"min_instances"` MaxInstances *int64 `json:"max_instances" yaml:"max_instances"` - Preemptible *bool `json:"preemptible" yaml:"preemptible"` - PreemptibleConfig *PreemptibleConfig `json:"preemptible_config" yaml:"preemptible_config"` + Preemptible bool `json:"preemptible" yaml:"preemptible"` + OnDemandBackup bool `json:"on_demand_backup" yaml:"on_demand_backup"` ClusterName string `json:"cluster_name" yaml:"cluster_name"` Telemetry bool `json:"telemetry" yaml:"telemetry"` ImageOperator string `json:"image_operator" yaml:"image_operator"` @@ -57,11 +57,6 @@ type GCPConfig struct { ImageGooglePause string `json:"image_google_pause" yaml:"image_google_pause"` } -type PreemptibleConfig struct { - OnDemandBaseCapacity *int64 `json:"on_demand_base_capacity" yaml:"on_demand_base_capacity"` - OnDemandBackup *bool `json:"on_demand_backup" yaml:"on_demand_backup"` -} - type InternalGCPConfig struct { GCPConfig @@ -180,30 +175,14 @@ var UserGCPValidation = &cr.StructValidation{ }, { StructField: "Preemptible", - BoolPtrValidation: &cr.BoolPtrValidation{ - Default: pointer.Bool(false), + BoolValidation: &cr.BoolValidation{ + Default: false, }, }, { - StructField: "PreemptibleConfig", - StructValidation: &cr.StructValidation{ - DefaultNil: true, - AllowExplicitNull: true, - StructFieldValidations: []*cr.StructFieldValidation{ - { - StructField: "OnDemandBaseCapacity", - Int64PtrValidation: &cr.Int64PtrValidation{ - GreaterThanOrEqualTo: pointer.Int64(0), - AllowExplicitNull: true, - }, - }, - { - StructField: "OnDemandBackup", - BoolPtrValidation: &cr.BoolPtrValidation{ - Default: pointer.Bool(true), - }, - }, - }, + StructField: "OnDemandBackup", + BoolValidation: &cr.BoolValidation{ + Default: false, }, }, { @@ -387,24 +366,8 @@ func (cc *GCPConfig) Validate(GCP *gcp.Client) error { } } - if cc.Preemptible != nil && *cc.Preemptible { - if cc.PreemptibleConfig == nil { - cc.PreemptibleConfig = &PreemptibleConfig{} - } - if cc.PreemptibleConfig.OnDemandBaseCapacity == nil { - cc.PreemptibleConfig.OnDemandBaseCapacity = pointer.Int64(0) - } - if cc.PreemptibleConfig.OnDemandBackup == nil { - cc.PreemptibleConfig.OnDemandBackup = pointer.Bool(true) - } - if *cc.PreemptibleConfig.OnDemandBaseCapacity > *cc.MaxInstances { - return ErrorOnDemandBaseCapacityGreaterThanMax(**&cc.PreemptibleConfig.OnDemandBaseCapacity, *cc.MaxInstances) - } - - } else { - if cc.PreemptibleConfig != nil { - return ErrorGCPConfiguredWhenPreemptibleIsNotEnabled(PreemptibleConfigKey) - } + if !cc.Preemptible && cc.OnDemandBackup { + return ErrorGCPConfiguredWhenPreemptibleIsNotEnabled(PreemptibleConfigKey) } return nil @@ -559,13 +522,8 @@ func (cc *GCPConfig) UserTable() table.KeyValuePairs { if cc.AcceleratorType != nil { items.Add(AcceleratorTypeUserKey, *cc.AcceleratorType) } - if cc.Preemptible != nil { - items.Add(PreemptibleUserKey, s.YesNo(*cc.Preemptible)) - } - if cc.Preemptible != nil && *cc.Preemptible { - items.Add(OnDemandBaseCapacityUserKey, *cc.PreemptibleConfig.OnDemandBaseCapacity) - items.Add(OnDemandBackupUserKey, s.YesNo(*cc.PreemptibleConfig.OnDemandBackup)) - } + items.Add(PreemptibleUserKey, s.YesNo(cc.Preemptible)) + items.Add(OnDemandBackupUserKey, s.YesNo(cc.OnDemandBackup)) if cc.Network != nil { items.Add(NetworkUserKey, *cc.Network) } @@ -619,19 +577,8 @@ func (cc *GCPConfig) TelemetryEvent() map[string]interface{} { if cc.ClusterName != "cortex" { event["cluster_name._is_custom"] = true } - if cc.Preemptible != nil { - event["preemptible._is_defined"] = true - event["preemptible"] = *cc.Preemptible - } - if cc.PreemptibleConfig != nil { - event["preemptible_config._is_defined"] = true - if cc.PreemptibleConfig.OnDemandBaseCapacity != nil { - event["preemptible_config.on_demand_base_capacity"] = *cc.PreemptibleConfig.OnDemandBaseCapacity - } - if cc.PreemptibleConfig.OnDemandBackup != nil { - event["preemptible_config.on_demand_backup"] = *cc.PreemptibleConfig.OnDemandBackup - } - } + event["preemptible"] = cc.Preemptible + event["on_demand_backup"] = cc.OnDemandBackup if cc.Zone != nil { event["zone._is_defined"] = true event["zone"] = *cc.Zone From 90b31d06b145f67461ae9f241c44d484d90f70ea Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Sat, 16 Jan 2021 02:12:57 +0200 Subject: [PATCH 03/11] Fixes --- cli/cmd/cluster_gcp.go | 4 +-- pkg/types/clusterconfig/cluster_config_aws.go | 4 +-- pkg/types/clusterconfig/cluster_config_gcp.go | 34 +++++++++++-------- pkg/types/clusterconfig/config_key.go | 3 +- pkg/types/clusterconfig/errors.go | 20 +++++------ 5 files changed, 34 insertions(+), 31 deletions(-) diff --git a/cli/cmd/cluster_gcp.go b/cli/cmd/cluster_gcp.go index 7370086bc5..b50dc2080f 100644 --- a/cli/cmd/cluster_gcp.go +++ b/cli/cmd/cluster_gcp.go @@ -458,7 +458,7 @@ func createGKECluster(clusterConfig *clusterconfig.GCPConfig, gcpClient *gcp.Cli Locations: []string{*clusterConfig.Zone}, } - if clusterConfig.Preemptible { + if *clusterConfig.Preemptible { gkeClusterConfig.NodePools = append(gkeClusterConfig.NodePools, &containerpb.NodePool{ Name: "ng-cortex-wk-preemp", Config: &containerpb.NodeConfig{ @@ -482,7 +482,7 @@ func createGKECluster(clusterConfig *clusterconfig.GCPConfig, gcpClient *gcp.Cli InitialNodeCount: int32(initialNodeCount), }) } - if clusterConfig.OnDemandBackup || !clusterConfig.Preemptible { + if *clusterConfig.OnDemandBackup || !*clusterConfig.Preemptible { gkeClusterConfig.NodePools = append(gkeClusterConfig.NodePools, &containerpb.NodePool{ Name: "ng-cortex-wk-on-dmd", Config: &containerpb.NodeConfig{ diff --git a/pkg/types/clusterconfig/cluster_config_aws.go b/pkg/types/clusterconfig/cluster_config_aws.go index b3ce6e8a22..e9625dde32 100644 --- a/pkg/types/clusterconfig/cluster_config_aws.go +++ b/pkg/types/clusterconfig/cluster_config_aws.go @@ -1154,9 +1154,7 @@ func (cc *Config) UserTable() table.KeyValuePairs { items.Add(InstanceVolumeSizeUserKey, cc.InstanceVolumeSize) items.Add(InstanceVolumeTypeUserKey, cc.InstanceVolumeType) items.Add(InstanceVolumeIOPSUserKey, cc.InstanceVolumeIOPS) - if cc.Spot != nil { - items.Add(SpotUserKey, s.YesNo(*cc.Spot)) - } + items.Add(SpotUserKey, s.YesNo(*cc.Spot)) if cc.Spot != nil && *cc.Spot { items.Add(InstanceDistributionUserKey, cc.SpotConfig.InstanceDistribution) items.Add(OnDemandBaseCapacityUserKey, *cc.SpotConfig.OnDemandBaseCapacity) diff --git a/pkg/types/clusterconfig/cluster_config_gcp.go b/pkg/types/clusterconfig/cluster_config_gcp.go index c7074705f7..8bd0090494 100644 --- a/pkg/types/clusterconfig/cluster_config_gcp.go +++ b/pkg/types/clusterconfig/cluster_config_gcp.go @@ -43,8 +43,8 @@ type GCPConfig struct { Subnet *string `json:"subnet" yaml:"subnet"` MinInstances *int64 `json:"min_instances" yaml:"min_instances"` MaxInstances *int64 `json:"max_instances" yaml:"max_instances"` - Preemptible bool `json:"preemptible" yaml:"preemptible"` - OnDemandBackup bool `json:"on_demand_backup" yaml:"on_demand_backup"` + Preemptible *bool `json:"preemptible" yaml:"preemptible"` + OnDemandBackup *bool `json:"on_demand_backup" yaml:"on_demand_backup"` ClusterName string `json:"cluster_name" yaml:"cluster_name"` Telemetry bool `json:"telemetry" yaml:"telemetry"` ImageOperator string `json:"image_operator" yaml:"image_operator"` @@ -175,15 +175,13 @@ var UserGCPValidation = &cr.StructValidation{ }, { StructField: "Preemptible", - BoolValidation: &cr.BoolValidation{ - Default: false, + BoolPtrValidation: &cr.BoolPtrValidation{ + Default: pointer.Bool(false), }, }, { - StructField: "OnDemandBackup", - BoolValidation: &cr.BoolValidation{ - Default: false, - }, + StructField: "OnDemandBackup", + BoolPtrValidation: &cr.BoolPtrValidation{}, }, { StructField: "Project", @@ -366,8 +364,8 @@ func (cc *GCPConfig) Validate(GCP *gcp.Client) error { } } - if !cc.Preemptible && cc.OnDemandBackup { - return ErrorGCPConfiguredWhenPreemptibleIsNotEnabled(PreemptibleConfigKey) + if !*cc.Preemptible && *cc.OnDemandBackup { + return ErrorGCPOnDemandEnabledWhenPreemptibleIsNotEnabled(OnDemandBackupKey) } return nil @@ -473,6 +471,14 @@ func SetGCPDefaults(cc *GCPConfig) error { if errors.HasError(errs) { return errors.FirstError(errs...) } + + if *cc.Preemptible && cc.OnDemandBackup == nil { + cc.OnDemandBackup = pointer.Bool(true) + } + if !*cc.Preemptible { + cc.OnDemandBackup = pointer.Bool(false) + } + return nil } @@ -522,8 +528,8 @@ func (cc *GCPConfig) UserTable() table.KeyValuePairs { if cc.AcceleratorType != nil { items.Add(AcceleratorTypeUserKey, *cc.AcceleratorType) } - items.Add(PreemptibleUserKey, s.YesNo(cc.Preemptible)) - items.Add(OnDemandBackupUserKey, s.YesNo(cc.OnDemandBackup)) + items.Add(PreemptibleUserKey, s.YesNo(*cc.Preemptible)) + items.Add(OnDemandBackupUserKey, s.YesNo(*cc.OnDemandBackup)) if cc.Network != nil { items.Add(NetworkUserKey, *cc.Network) } @@ -577,8 +583,8 @@ func (cc *GCPConfig) TelemetryEvent() map[string]interface{} { if cc.ClusterName != "cortex" { event["cluster_name._is_custom"] = true } - event["preemptible"] = cc.Preemptible - event["on_demand_backup"] = cc.OnDemandBackup + event["preemptible"] = *cc.Preemptible + event["on_demand_backup"] = *cc.OnDemandBackup if cc.Zone != nil { event["zone._is_defined"] = true event["zone"] = *cc.Zone diff --git a/pkg/types/clusterconfig/config_key.go b/pkg/types/clusterconfig/config_key.go index 324d8e6444..0fa7ebb9a4 100644 --- a/pkg/types/clusterconfig/config_key.go +++ b/pkg/types/clusterconfig/config_key.go @@ -31,7 +31,6 @@ const ( SpotKey = "spot" SpotConfigKey = "spot_config" PreemptibleKey = "preemptible" - PreemptibleConfigKey = "preemptible_config" InstanceDistributionKey = "instance_distribution" OnDemandBaseCapacityKey = "on_demand_base_capacity" OnDemandPercentageAboveBaseCapacityKey = "on_demand_percentage_above_base_capacity" @@ -83,7 +82,7 @@ const ( SSLCertificateARNUserKey = "ssl certificate arn" BucketUserKey = "s3 bucket" SpotUserKey = "use spot instances" - PreemptibleUserKey = "user preemptible instances" + PreemptibleUserKey = "use preemptible instances" InstanceTypeUserKey = "instance type" AcceleratorTypeUserKey = "accelerator type" NetworkUserKey = "network" diff --git a/pkg/types/clusterconfig/errors.go b/pkg/types/clusterconfig/errors.go index 10c9fabbf3..5db858a774 100644 --- a/pkg/types/clusterconfig/errors.go +++ b/pkg/types/clusterconfig/errors.go @@ -61,13 +61,13 @@ const ( ErrSSLCertificateARNNotFound = "clusterconfig.ssl_certificate_arn_not_found" ErrProviderMismatch = "clusterconfig.provider_mismatch" - ErrGCPInvalidProjectID = "clusterconfig.gcp_invalid_project_id" - ErrGCPProjectMustBeSpecified = "clusterconfig.gcp_project_must_be_specified" - ErrGCPInvalidZone = "clusterconfig.gcp_invalid_zone" - ErrGCPInvalidInstanceType = "clusterconfig.gcp_invalid_instance_type" - ErrGCPInvalidAcceleratorType = "clusterconfig.gcp_invalid_accelerator_type" - ErrGCPIncompatibleInstanceTypeWithAccelerator = "clusterconfig.gcp_incompatible_instance_type_with_accelerator" - ErrGCPConfiguredWhenPreemptibleIsNotEnabled = "clusterconfig.gcp_configured_when_preemptible_is_not_enabled" + ErrGCPInvalidProjectID = "clusterconfig.gcp_invalid_project_id" + ErrGCPProjectMustBeSpecified = "clusterconfig.gcp_project_must_be_specified" + ErrGCPInvalidZone = "clusterconfig.gcp_invalid_zone" + ErrGCPInvalidInstanceType = "clusterconfig.gcp_invalid_instance_type" + ErrGCPInvalidAcceleratorType = "clusterconfig.gcp_invalid_accelerator_type" + ErrGCPIncompatibleInstanceTypeWithAccelerator = "clusterconfig.gcp_incompatible_instance_type_with_accelerator" + ErrGCPOnDemandEnabledWhenPreemptibleIsNotEnabled = "clusterconfig.gcp_on_demand_enabled_when_preemptible_is_not_enabled" ) func ErrorInvalidRegion(region string) error { @@ -369,9 +369,9 @@ func ErrorGCPIncompatibleInstanceTypeWithAccelerator(instanceType, acceleratorTy }) } -func ErrorGCPConfiguredWhenPreemptibleIsNotEnabled(configKey string) error { +func ErrorGCPOnDemandEnabledWhenPreemptibleIsNotEnabled(configKey string) error { return errors.WithStack(&errors.Error{ - Kind: ErrGCPConfiguredWhenPreemptibleIsNotEnabled, - Message: fmt.Sprintf("%s cannot be specified unless preemptible is enabled (to enable preemptible instances, set `%s: true` in your cluster configuration file)", configKey, PreemptibleKey), + Kind: ErrGCPOnDemandEnabledWhenPreemptibleIsNotEnabled, + Message: fmt.Sprintf("%s cannot be enabled unless preemptible is enabled (to enable preemptible instances, set `%s: true` in your cluster configuration file)", configKey, PreemptibleKey), }) } From 58405ec3fd59330423f58aa32aee904e5377926b Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Sat, 16 Jan 2021 03:03:12 +0200 Subject: [PATCH 04/11] Panic fix --- cli/cmd/lib_cluster_config_gcp.go | 7 +++++++ pkg/types/clusterconfig/cluster_config_gcp.go | 7 ------- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cli/cmd/lib_cluster_config_gcp.go b/cli/cmd/lib_cluster_config_gcp.go index e8c5a63786..0d045b8a18 100644 --- a/cli/cmd/lib_cluster_config_gcp.go +++ b/cli/cmd/lib_cluster_config_gcp.go @@ -180,6 +180,13 @@ func getGCPInstallClusterConfig(gcpClient *gcp.Client, accessConfig clusterconfi } } + if clusterConfig.OnDemandBackup == nil { + if *clusterConfig.Preemptible { + clusterConfig.OnDemandBackup = pointer.Bool(true) + } else { + clusterConfig.OnDemandBackup = pointer.Bool(false) + } + } clusterConfig.ClusterName = *accessConfig.ClusterName clusterConfig.Zone = accessConfig.Zone clusterConfig.Project = accessConfig.Project diff --git a/pkg/types/clusterconfig/cluster_config_gcp.go b/pkg/types/clusterconfig/cluster_config_gcp.go index 8bd0090494..c07821b849 100644 --- a/pkg/types/clusterconfig/cluster_config_gcp.go +++ b/pkg/types/clusterconfig/cluster_config_gcp.go @@ -472,13 +472,6 @@ func SetGCPDefaults(cc *GCPConfig) error { return errors.FirstError(errs...) } - if *cc.Preemptible && cc.OnDemandBackup == nil { - cc.OnDemandBackup = pointer.Bool(true) - } - if !*cc.Preemptible { - cc.OnDemandBackup = pointer.Bool(false) - } - return nil } From a46ccbf5816320b4645d2dde98cc0947f4c572e0 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Sat, 16 Jan 2021 03:04:05 +0200 Subject: [PATCH 05/11] Remove CA options to down-scale immediately --- manager/manifests/cluster-autoscaler.yaml.j2 | 2 -- 1 file changed, 2 deletions(-) diff --git a/manager/manifests/cluster-autoscaler.yaml.j2 b/manager/manifests/cluster-autoscaler.yaml.j2 index 1d2d7ea4f4..360c8f9176 100644 --- a/manager/manifests/cluster-autoscaler.yaml.j2 +++ b/manager/manifests/cluster-autoscaler.yaml.j2 @@ -184,8 +184,6 @@ spec: - ./cluster-autoscaler - --v=4 - --stderrthreshold=info - - --scale-down-delay-after-add=10s - - --scale-down-unneeded-time=20s {% if config["provider"] == "aws" %} - --cloud-provider=aws {% else %} From 1c69b749c15e80baa14cbc18b4ce79f8745f178b Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Sat, 16 Jan 2021 03:09:27 +0200 Subject: [PATCH 06/11] Add spot reference to the docs --- docs/clusters/gcp/install.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/clusters/gcp/install.md b/docs/clusters/gcp/install.md index 6bcfed8153..2768fbf5da 100644 --- a/docs/clusters/gcp/install.md +++ b/docs/clusters/gcp/install.md @@ -40,6 +40,12 @@ max_instances: 5 # GPU to attach to your instance (optional) # accelerator_type: nvidia-tesla-t4 +# enable the use of preemptible instances +# preemptible: false + +# enable the use of on-demand backup instances when the capacity for preemptible instances runs out +# on_demand_backup: false + # the name of the network in which to create your cluster # network: default From 52c13c807ee17992399d0ef21faa715a5da9c559 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Sat, 16 Jan 2021 03:13:44 +0200 Subject: [PATCH 07/11] A few nits --- cli/cmd/cluster_gcp.go | 1 + pkg/types/clusterconfig/cluster_config_gcp.go | 2 +- pkg/types/clusterconfig/errors.go | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/cli/cmd/cluster_gcp.go b/cli/cmd/cluster_gcp.go index b50dc2080f..3d8e078059 100644 --- a/cli/cmd/cluster_gcp.go +++ b/cli/cmd/cluster_gcp.go @@ -500,6 +500,7 @@ func createGKECluster(clusterConfig *clusterconfig.GCPConfig, gcpClient *gcp.Cli "https://www.googleapis.com/auth/compute", "https://www.googleapis.com/auth/devstorage.read_only", }, + ServiceAccount: gcpClient.ClientEmail, }, InitialNodeCount: int32(initialNodeCount), }) diff --git a/pkg/types/clusterconfig/cluster_config_gcp.go b/pkg/types/clusterconfig/cluster_config_gcp.go index c07821b849..abc92aa287 100644 --- a/pkg/types/clusterconfig/cluster_config_gcp.go +++ b/pkg/types/clusterconfig/cluster_config_gcp.go @@ -365,7 +365,7 @@ func (cc *GCPConfig) Validate(GCP *gcp.Client) error { } if !*cc.Preemptible && *cc.OnDemandBackup { - return ErrorGCPOnDemandEnabledWhenPreemptibleIsNotEnabled(OnDemandBackupKey) + return ErrorGCPOnDemandEnabledWhenPreemptibleIsNotEnabled() } return nil diff --git a/pkg/types/clusterconfig/errors.go b/pkg/types/clusterconfig/errors.go index 5db858a774..59c63e2da4 100644 --- a/pkg/types/clusterconfig/errors.go +++ b/pkg/types/clusterconfig/errors.go @@ -369,9 +369,9 @@ func ErrorGCPIncompatibleInstanceTypeWithAccelerator(instanceType, acceleratorTy }) } -func ErrorGCPOnDemandEnabledWhenPreemptibleIsNotEnabled(configKey string) error { +func ErrorGCPOnDemandEnabledWhenPreemptibleIsNotEnabled() error { return errors.WithStack(&errors.Error{ Kind: ErrGCPOnDemandEnabledWhenPreemptibleIsNotEnabled, - Message: fmt.Sprintf("%s cannot be enabled unless preemptible is enabled (to enable preemptible instances, set `%s: true` in your cluster configuration file)", configKey, PreemptibleKey), + Message: fmt.Sprintf("%s cannot be enabled unless preemptible is enabled (to enable preemptible instances, set `%s: true` in your cluster configuration file)", OnDemandBackupKey, PreemptibleKey), }) } From 9dc78d5157165f3376f7ed55e97f497dd1800fb9 Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Sat, 16 Jan 2021 03:31:20 +0200 Subject: [PATCH 08/11] Better error function --- pkg/types/clusterconfig/cluster_config_gcp.go | 2 +- pkg/types/clusterconfig/errors.go | 28 +++++++++---------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/pkg/types/clusterconfig/cluster_config_gcp.go b/pkg/types/clusterconfig/cluster_config_gcp.go index abc92aa287..ea51b52da3 100644 --- a/pkg/types/clusterconfig/cluster_config_gcp.go +++ b/pkg/types/clusterconfig/cluster_config_gcp.go @@ -365,7 +365,7 @@ func (cc *GCPConfig) Validate(GCP *gcp.Client) error { } if !*cc.Preemptible && *cc.OnDemandBackup { - return ErrorGCPOnDemandEnabledWhenPreemptibleIsNotEnabled() + return ErrorFieldConfigurationDependentOnCondition(OnDemandBackupKey, s.Bool(*cc.OnDemandBackup), PreemptibleKey, s.Bool(*cc.Preemptible)) } return nil diff --git a/pkg/types/clusterconfig/errors.go b/pkg/types/clusterconfig/errors.go index 59c63e2da4..233ef30985 100644 --- a/pkg/types/clusterconfig/errors.go +++ b/pkg/types/clusterconfig/errors.go @@ -51,6 +51,7 @@ const ( ErrNotEnoughValidDefaultAvailibilityZones = "clusterconfig.not_enough_valid_default_availability_zones" ErrNoNATGatewayWithSubnets = "clusterconfig.no_nat_gateway_with_subnets" ErrSpecifyOneOrNone = "clusterconfig.specify_one_or_none" + ErrFieldConfigurationDependentOnCondition = "clusterconfig.field_configuration_dependent_on_condition" ErrDidNotMatchStrictS3Regex = "clusterconfig.did_not_match_strict_s3_regex" ErrNATRequiredWithPrivateSubnetVisibility = "clusterconfig.nat_required_with_private_subnet_visibility" ErrS3RegionDiffersFromCluster = "clusterconfig.s3_region_differs_from_cluster" @@ -61,13 +62,12 @@ const ( ErrSSLCertificateARNNotFound = "clusterconfig.ssl_certificate_arn_not_found" ErrProviderMismatch = "clusterconfig.provider_mismatch" - ErrGCPInvalidProjectID = "clusterconfig.gcp_invalid_project_id" - ErrGCPProjectMustBeSpecified = "clusterconfig.gcp_project_must_be_specified" - ErrGCPInvalidZone = "clusterconfig.gcp_invalid_zone" - ErrGCPInvalidInstanceType = "clusterconfig.gcp_invalid_instance_type" - ErrGCPInvalidAcceleratorType = "clusterconfig.gcp_invalid_accelerator_type" - ErrGCPIncompatibleInstanceTypeWithAccelerator = "clusterconfig.gcp_incompatible_instance_type_with_accelerator" - ErrGCPOnDemandEnabledWhenPreemptibleIsNotEnabled = "clusterconfig.gcp_on_demand_enabled_when_preemptible_is_not_enabled" + ErrGCPInvalidProjectID = "clusterconfig.gcp_invalid_project_id" + ErrGCPProjectMustBeSpecified = "clusterconfig.gcp_project_must_be_specified" + ErrGCPInvalidZone = "clusterconfig.gcp_invalid_zone" + ErrGCPInvalidInstanceType = "clusterconfig.gcp_invalid_instance_type" + ErrGCPInvalidAcceleratorType = "clusterconfig.gcp_invalid_accelerator_type" + ErrGCPIncompatibleInstanceTypeWithAccelerator = "clusterconfig.gcp_incompatible_instance_type_with_accelerator" ) func ErrorInvalidRegion(region string) error { @@ -243,6 +243,13 @@ func ErrorSpecifyOneOrNone(fieldName1 string, fieldName2 string, fieldNames ...s }) } +func ErrorFieldConfigurationDependentOnCondition(configuredField string, configuredFieldValue string, dependencyField string, dependencyFieldValue string) error { + return errors.WithStack(&errors.Error{ + Kind: ErrFieldConfigurationDependentOnCondition, + Message: fmt.Sprintf("cannot set %s = %s when %s = %s", configuredField, configuredFieldValue, dependencyField, dependencyFieldValue), + }) +} + func ErrorDidNotMatchStrictS3Regex() error { return errors.WithStack(&errors.Error{ Kind: ErrDidNotMatchStrictS3Regex, @@ -368,10 +375,3 @@ func ErrorGCPIncompatibleInstanceTypeWithAccelerator(instanceType, acceleratorTy Message: fmt.Sprintf("instance type %s is incompatible with the %s accelerator; the following instance types are compatible with the %s accelerator in zone %s: %s", instanceType, acceleratorType, acceleratorType, zone, s.StrsOr(compatibleInstances)), }) } - -func ErrorGCPOnDemandEnabledWhenPreemptibleIsNotEnabled() error { - return errors.WithStack(&errors.Error{ - Kind: ErrGCPOnDemandEnabledWhenPreemptibleIsNotEnabled, - Message: fmt.Sprintf("%s cannot be enabled unless preemptible is enabled (to enable preemptible instances, set `%s: true` in your cluster configuration file)", OnDemandBackupKey, PreemptibleKey), - }) -} From 760d4736a6e5c0cbe1e3f35e1d82b8a21ae0dc5a Mon Sep 17 00:00:00 2001 From: Robert Lucian Chiriac Date: Mon, 18 Jan 2021 21:41:23 +0200 Subject: [PATCH 09/11] Address PR comments --- build/images.sh | 2 +- cli/cmd/cluster_gcp.go | 4 +-- cli/cmd/lib_cluster_config_gcp.go | 7 ----- pkg/types/clusterconfig/cluster_config_gcp.go | 28 +++++++++++-------- 4 files changed, 19 insertions(+), 22 deletions(-) diff --git a/build/images.sh b/build/images.sh index 20b36d18c9..adf1caf142 100644 --- a/build/images.sh +++ b/build/images.sh @@ -64,6 +64,7 @@ dev_images_gcp=( non_dev_images_cluster=( "tensorflow-serving-cpu" "tensorflow-serving-gpu" + "cluster-autoscaler" "operator" "istio-proxy" "istio-pilot" @@ -72,7 +73,6 @@ non_dev_images_cluster=( non_dev_images_aws=( # includes non_dev_images_cluster "tensorflow-serving-inf" - "cluster-autoscaler" "metrics-server" "inferentia" "neuron-rtd" diff --git a/cli/cmd/cluster_gcp.go b/cli/cmd/cluster_gcp.go index 3d8e078059..d5ad0bbdbe 100644 --- a/cli/cmd/cluster_gcp.go +++ b/cli/cmd/cluster_gcp.go @@ -458,7 +458,7 @@ func createGKECluster(clusterConfig *clusterconfig.GCPConfig, gcpClient *gcp.Cli Locations: []string{*clusterConfig.Zone}, } - if *clusterConfig.Preemptible { + if clusterConfig.Preemptible { gkeClusterConfig.NodePools = append(gkeClusterConfig.NodePools, &containerpb.NodePool{ Name: "ng-cortex-wk-preemp", Config: &containerpb.NodeConfig{ @@ -482,7 +482,7 @@ func createGKECluster(clusterConfig *clusterconfig.GCPConfig, gcpClient *gcp.Cli InitialNodeCount: int32(initialNodeCount), }) } - if *clusterConfig.OnDemandBackup || !*clusterConfig.Preemptible { + if clusterConfig.OnDemandBackup || !clusterConfig.Preemptible { gkeClusterConfig.NodePools = append(gkeClusterConfig.NodePools, &containerpb.NodePool{ Name: "ng-cortex-wk-on-dmd", Config: &containerpb.NodeConfig{ diff --git a/cli/cmd/lib_cluster_config_gcp.go b/cli/cmd/lib_cluster_config_gcp.go index 0d045b8a18..e8c5a63786 100644 --- a/cli/cmd/lib_cluster_config_gcp.go +++ b/cli/cmd/lib_cluster_config_gcp.go @@ -180,13 +180,6 @@ func getGCPInstallClusterConfig(gcpClient *gcp.Client, accessConfig clusterconfi } } - if clusterConfig.OnDemandBackup == nil { - if *clusterConfig.Preemptible { - clusterConfig.OnDemandBackup = pointer.Bool(true) - } else { - clusterConfig.OnDemandBackup = pointer.Bool(false) - } - } clusterConfig.ClusterName = *accessConfig.ClusterName clusterConfig.Zone = accessConfig.Zone clusterConfig.Project = accessConfig.Project diff --git a/pkg/types/clusterconfig/cluster_config_gcp.go b/pkg/types/clusterconfig/cluster_config_gcp.go index ea51b52da3..2b25df5b43 100644 --- a/pkg/types/clusterconfig/cluster_config_gcp.go +++ b/pkg/types/clusterconfig/cluster_config_gcp.go @@ -43,8 +43,8 @@ type GCPConfig struct { Subnet *string `json:"subnet" yaml:"subnet"` MinInstances *int64 `json:"min_instances" yaml:"min_instances"` MaxInstances *int64 `json:"max_instances" yaml:"max_instances"` - Preemptible *bool `json:"preemptible" yaml:"preemptible"` - OnDemandBackup *bool `json:"on_demand_backup" yaml:"on_demand_backup"` + Preemptible bool `json:"preemptible" yaml:"preemptible"` + OnDemandBackup bool `json:"on_demand_backup" yaml:"on_demand_backup"` ClusterName string `json:"cluster_name" yaml:"cluster_name"` Telemetry bool `json:"telemetry" yaml:"telemetry"` ImageOperator string `json:"image_operator" yaml:"image_operator"` @@ -175,13 +175,17 @@ var UserGCPValidation = &cr.StructValidation{ }, { StructField: "Preemptible", - BoolPtrValidation: &cr.BoolPtrValidation{ - Default: pointer.Bool(false), + BoolValidation: &cr.BoolValidation{ + Default: false, }, }, { - StructField: "OnDemandBackup", - BoolPtrValidation: &cr.BoolPtrValidation{}, + StructField: "OnDemandBackup", + DefaultDependentFields: []string{"Preemptible"}, + DefaultDependentFieldsFunc: func(vals []interface{}) interface{} { + return vals[0].(bool) + }, + BoolValidation: &cr.BoolValidation{}, }, { StructField: "Project", @@ -364,8 +368,8 @@ func (cc *GCPConfig) Validate(GCP *gcp.Client) error { } } - if !*cc.Preemptible && *cc.OnDemandBackup { - return ErrorFieldConfigurationDependentOnCondition(OnDemandBackupKey, s.Bool(*cc.OnDemandBackup), PreemptibleKey, s.Bool(*cc.Preemptible)) + if !cc.Preemptible && cc.OnDemandBackup { + return ErrorFieldConfigurationDependentOnCondition(OnDemandBackupKey, s.Bool(cc.OnDemandBackup), PreemptibleKey, s.Bool(cc.Preemptible)) } return nil @@ -521,8 +525,8 @@ func (cc *GCPConfig) UserTable() table.KeyValuePairs { if cc.AcceleratorType != nil { items.Add(AcceleratorTypeUserKey, *cc.AcceleratorType) } - items.Add(PreemptibleUserKey, s.YesNo(*cc.Preemptible)) - items.Add(OnDemandBackupUserKey, s.YesNo(*cc.OnDemandBackup)) + items.Add(PreemptibleUserKey, s.YesNo(cc.Preemptible)) + items.Add(OnDemandBackupUserKey, s.YesNo(cc.OnDemandBackup)) if cc.Network != nil { items.Add(NetworkUserKey, *cc.Network) } @@ -576,8 +580,8 @@ func (cc *GCPConfig) TelemetryEvent() map[string]interface{} { if cc.ClusterName != "cortex" { event["cluster_name._is_custom"] = true } - event["preemptible"] = *cc.Preemptible - event["on_demand_backup"] = *cc.OnDemandBackup + event["preemptible"] = cc.Preemptible + event["on_demand_backup"] = cc.OnDemandBackup if cc.Zone != nil { event["zone._is_defined"] = true event["zone"] = *cc.Zone From b906ebd700c19a0d3a7ec1e5952f2f154ba74678 Mon Sep 17 00:00:00 2001 From: David Eliahu Date: Mon, 18 Jan 2021 12:54:28 -0800 Subject: [PATCH 10/11] Update install.md --- docs/clusters/gcp/install.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/clusters/gcp/install.md b/docs/clusters/gcp/install.md index bf99dc749d..cc01b0666e 100644 --- a/docs/clusters/gcp/install.md +++ b/docs/clusters/gcp/install.md @@ -37,18 +37,18 @@ min_instances: 1 # maximum number of instances max_instances: 5 +# enable the use of preemptible instances +preemptible: false + +# enable the use of on-demand backup instances which will be used when preemptible capacity runs out +on_demand_backup: false + # GPU to attach to your instance (optional) # accelerator_type: nvidia-tesla-t4 # the number of GPUs to attach to each instance (optional) # accelerators_per_instance: 1 -# enable the use of preemptible instances -# preemptible: false - -# enable the use of on-demand backup instances when the capacity for preemptible instances runs out -# on_demand_backup: false - # the name of the network in which to create your cluster # network: default From f7d5b5592eb52370bc9eb69475aade01b4289d64 Mon Sep 17 00:00:00 2001 From: David Eliahu Date: Mon, 18 Jan 2021 12:57:11 -0800 Subject: [PATCH 11/11] Update install.md --- docs/clusters/gcp/install.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/clusters/gcp/install.md b/docs/clusters/gcp/install.md index cc01b0666e..227ea56b70 100644 --- a/docs/clusters/gcp/install.md +++ b/docs/clusters/gcp/install.md @@ -41,7 +41,8 @@ max_instances: 5 preemptible: false # enable the use of on-demand backup instances which will be used when preemptible capacity runs out -on_demand_backup: false +# default is true when preemptible instances are used +# on_demand_backup: true # GPU to attach to your instance (optional) # accelerator_type: nvidia-tesla-t4