From 015ddfaa5f45153ec7f4a12b5f2234ae54772c58 Mon Sep 17 00:00:00 2001 From: David Eliahu Date: Wed, 4 Sep 2019 17:16:48 -0700 Subject: [PATCH 1/8] Add kubelet config --- manager/eks.yaml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/manager/eks.yaml b/manager/eks.yaml index bca7b54bec..11897677da 100644 --- a/manager/eks.yaml +++ b/manager/eks.yaml @@ -18,3 +18,16 @@ nodeGroups: iam: withAddonPolicies: autoScaler: true + kubeletExtraConfig: + kubeReserved: + cpu: 150m + memory: 300Mi + ephemeral-storage: 1Gi + # kubeReservedCgroup: /kube-reserved + systemReserved: + cpu: 150m + memory: 300Mi + ephemeral-storage: 1Gi + evictionHard: + memory.available: 200Mi + nodefs.available: 5% From 4c696fa74e79d0000916e1eabbb4666026260e89 Mon Sep 17 00:00:00 2001 From: David Eliahu Date: Thu, 5 Sep 2019 09:23:42 -0700 Subject: [PATCH 2/8] Check for evicted pods --- pkg/lib/k8s/pod.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pkg/lib/k8s/pod.go b/pkg/lib/k8s/pod.go index bbd5fe9da3..3a0ff42ee8 100644 --- a/pkg/lib/k8s/pod.go +++ b/pkg/lib/k8s/pod.go @@ -145,6 +145,10 @@ func GetPodStatus(pod *kcore.Pod) PodStatus { case kcore.PodSucceeded: return PodStatusSucceeded case kcore.PodFailed: + if pod.Status.Reason == "Evicted" { + return PodStatusKilledOOM + } + for _, containerStatus := range pod.Status.ContainerStatuses { if containerStatus.LastTerminationState.Terminated != nil { exitCode := containerStatus.LastTerminationState.Terminated.ExitCode From bbf701c9b5f809f75b3e67d5dd2cd3478f740fc9 Mon Sep 17 00:00:00 2001 From: David Eliahu Date: Thu, 5 Sep 2019 09:44:50 -0700 Subject: [PATCH 3/8] Add message regex --- pkg/lib/k8s/pod.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pkg/lib/k8s/pod.go b/pkg/lib/k8s/pod.go index 3a0ff42ee8..af92ab440d 100644 --- a/pkg/lib/k8s/pod.go +++ b/pkg/lib/k8s/pod.go @@ -17,6 +17,7 @@ limitations under the License. package k8s import ( + "regexp" "time" kcore "k8s.io/api/core/v1" @@ -32,6 +33,8 @@ var podTypeMeta = kmeta.TypeMeta{ Kind: "Pod", } +const ReasonEvicted = "Evicted" + type PodStatus string const ( @@ -130,6 +133,8 @@ func GetPodReadyTime(pod *kcore.Pod) *time.Time { return nil } +var evictedMemoryMessageRegex = regexp.MustCompile(`(?i)low\W+on\W+resource\W+memory`) + func GetPodStatus(pod *kcore.Pod) PodStatus { if pod == nil { return PodStatusUnknown @@ -145,7 +150,7 @@ func GetPodStatus(pod *kcore.Pod) PodStatus { case kcore.PodSucceeded: return PodStatusSucceeded case kcore.PodFailed: - if pod.Status.Reason == "Evicted" { + if pod.Status.Reason == ReasonEvicted && evictedMemoryMessageRegex.MatchString(pod.Status.Message) { return PodStatusKilledOOM } From 6177d6eaf9a22efe4deca61e05bc65802da19eec Mon Sep 17 00:00:00 2001 From: vishal Date: Thu, 5 Sep 2019 15:04:02 -0400 Subject: [PATCH 4/8] Configure cron to cleanup evicted pods --- pkg/operator/workloads/cron.go | 44 ++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/pkg/operator/workloads/cron.go b/pkg/operator/workloads/cron.go index acd2705d3e..6cbfa7d109 100644 --- a/pkg/operator/workloads/cron.go +++ b/pkg/operator/workloads/cron.go @@ -19,9 +19,11 @@ package workloads import ( "time" + kcore "k8s.io/api/core/v1" kmeta "k8s.io/apimachinery/pkg/apis/meta/v1" "github.com/cortexlabs/cortex/pkg/lib/errors" + "github.com/cortexlabs/cortex/pkg/lib/sets/strset" "github.com/cortexlabs/cortex/pkg/operator/config" ) @@ -60,6 +62,7 @@ func runCron() { "workloadType": workloadTypeAPI, "userFacing": "true", }) + if err != nil { config.Telemetry.ReportError(err) errors.PrintError(err) @@ -73,11 +76,17 @@ func runCron() { failedPods, err := config.Kubernetes.ListPods(&kmeta.ListOptions{ FieldSelector: "status.phase=Failed", }) + if err != nil { config.Telemetry.ReportError(err) errors.PrintError(err) } + if err := deleteEvictedPods(failedPods); err != nil { + config.Telemetry.ReportError(err) + errors.PrintError(err) + } + if err := updateDataWorkloadErrors(failedPods); err != nil { config.Telemetry.ReportError(err) errors.PrintError(err) @@ -93,3 +102,38 @@ func reportAndRecover(strs ...string) error { } return nil } + +func deleteEvictedPods(failedPods []kcore.Pod) error { + evictedPods := []kcore.Pod{} + for _, pod := range failedPods { + if pod.Status.Reason == "Evicted" { + evictedPods = append(evictedPods, pod) + } + } + + if len(evictedPods) > 0 { + savedEvictedPods := map[string]kcore.Pod{} + currentWorkloadIDs := strset.New() + for _, ctx := range CurrentContexts() { + currentWorkloadIDs.Merge(ctx.ComputedResourceWorkloadIDs()) + } + + for _, pod := range evictedPods { + if currentWorkloadIDs.Has(pod.Labels["workloadID"]) { + if _, ok := savedEvictedPods[pod.Labels["resourceID"]]; !ok { + savedEvictedPods[pod.Labels["resourceID"]] = pod + continue + } + } + isSuccessful, err := config.Kubernetes.DeletePod(pod.Name) + if err != nil { + return err + } + if !isSuccessful { + return errors.New("failed to delete evicted pod " + pod.Name) + } + } + } + + return nil +} From 68eece4dfba7954e5751a641f28f07cb89e9d579 Mon Sep 17 00:00:00 2001 From: vishal Date: Thu, 5 Sep 2019 19:20:24 +0000 Subject: [PATCH 5/8] Do not write to telemetry if evicted pods fail to be deleted --- pkg/operator/workloads/cron.go | 1 - 1 file changed, 1 deletion(-) diff --git a/pkg/operator/workloads/cron.go b/pkg/operator/workloads/cron.go index 6cbfa7d109..560bb3fa66 100644 --- a/pkg/operator/workloads/cron.go +++ b/pkg/operator/workloads/cron.go @@ -83,7 +83,6 @@ func runCron() { } if err := deleteEvictedPods(failedPods); err != nil { - config.Telemetry.ReportError(err) errors.PrintError(err) } From ffaf7877445d83ef994ad58f02e5fa6f72db2bb0 Mon Sep 17 00:00:00 2001 From: vishal Date: Thu, 5 Sep 2019 19:26:28 +0000 Subject: [PATCH 6/8] Add kubeReservedCgroup back to eks config --- manager/eks.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/manager/eks.yaml b/manager/eks.yaml index ca8a14baa8..ca328379bc 100644 --- a/manager/eks.yaml +++ b/manager/eks.yaml @@ -37,7 +37,7 @@ nodeGroups: cpu: 150m memory: 300Mi ephemeral-storage: 1Gi - # kubeReservedCgroup: /kube-reserved + kubeReservedCgroup: /kube-reserved systemReserved: cpu: 150m memory: 300Mi From a72d4ee41cb193b4d4057f9d468f1e426f90f75b Mon Sep 17 00:00:00 2001 From: vishal Date: Thu, 5 Sep 2019 15:38:52 -0400 Subject: [PATCH 7/8] Use constant instead of string --- pkg/operator/workloads/cron.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pkg/operator/workloads/cron.go b/pkg/operator/workloads/cron.go index 560bb3fa66..2d6dccf7eb 100644 --- a/pkg/operator/workloads/cron.go +++ b/pkg/operator/workloads/cron.go @@ -23,6 +23,7 @@ import ( kmeta "k8s.io/apimachinery/pkg/apis/meta/v1" "github.com/cortexlabs/cortex/pkg/lib/errors" + "github.com/cortexlabs/cortex/pkg/lib/k8s" "github.com/cortexlabs/cortex/pkg/lib/sets/strset" "github.com/cortexlabs/cortex/pkg/operator/config" ) @@ -105,7 +106,7 @@ func reportAndRecover(strs ...string) error { func deleteEvictedPods(failedPods []kcore.Pod) error { evictedPods := []kcore.Pod{} for _, pod := range failedPods { - if pod.Status.Reason == "Evicted" { + if pod.Status.Reason == k8s.ReasonEvicted { evictedPods = append(evictedPods, pod) } } From 1847d481378791b4bf1d6eb51dd02a7b0368c784 Mon Sep 17 00:00:00 2001 From: vishal Date: Thu, 5 Sep 2019 15:47:52 -0400 Subject: [PATCH 8/8] Attempt to delete all evicted pods even if an error is encountered --- pkg/operator/workloads/cron.go | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/pkg/operator/workloads/cron.go b/pkg/operator/workloads/cron.go index 2d6dccf7eb..051a304ea5 100644 --- a/pkg/operator/workloads/cron.go +++ b/pkg/operator/workloads/cron.go @@ -83,9 +83,7 @@ func runCron() { errors.PrintError(err) } - if err := deleteEvictedPods(failedPods); err != nil { - errors.PrintError(err) - } + deleteEvictedPods(failedPods) if err := updateDataWorkloadErrors(failedPods); err != nil { config.Telemetry.ReportError(err) @@ -103,7 +101,7 @@ func reportAndRecover(strs ...string) error { return nil } -func deleteEvictedPods(failedPods []kcore.Pod) error { +func deleteEvictedPods(failedPods []kcore.Pod) { evictedPods := []kcore.Pod{} for _, pod := range failedPods { if pod.Status.Reason == k8s.ReasonEvicted { @@ -125,15 +123,10 @@ func deleteEvictedPods(failedPods []kcore.Pod) error { continue } } - isSuccessful, err := config.Kubernetes.DeletePod(pod.Name) + _, err := config.Kubernetes.DeletePod(pod.Name) if err != nil { - return err - } - if !isSuccessful { - return errors.New("failed to delete evicted pod " + pod.Name) + errors.PrintError(err) } } } - - return nil }