diff --git a/manager/eks.yaml b/manager/eks.yaml index dfd18d9bd1..ca328379bc 100644 --- a/manager/eks.yaml +++ b/manager/eks.yaml @@ -32,3 +32,16 @@ nodeGroups: iam: withAddonPolicies: autoScaler: true + kubeletExtraConfig: + kubeReserved: + cpu: 150m + memory: 300Mi + ephemeral-storage: 1Gi + kubeReservedCgroup: /kube-reserved + systemReserved: + cpu: 150m + memory: 300Mi + ephemeral-storage: 1Gi + evictionHard: + memory.available: 200Mi + nodefs.available: 5% diff --git a/pkg/lib/k8s/pod.go b/pkg/lib/k8s/pod.go index bbd5fe9da3..af92ab440d 100644 --- a/pkg/lib/k8s/pod.go +++ b/pkg/lib/k8s/pod.go @@ -17,6 +17,7 @@ limitations under the License. package k8s import ( + "regexp" "time" kcore "k8s.io/api/core/v1" @@ -32,6 +33,8 @@ var podTypeMeta = kmeta.TypeMeta{ Kind: "Pod", } +const ReasonEvicted = "Evicted" + type PodStatus string const ( @@ -130,6 +133,8 @@ func GetPodReadyTime(pod *kcore.Pod) *time.Time { return nil } +var evictedMemoryMessageRegex = regexp.MustCompile(`(?i)low\W+on\W+resource\W+memory`) + func GetPodStatus(pod *kcore.Pod) PodStatus { if pod == nil { return PodStatusUnknown @@ -145,6 +150,10 @@ func GetPodStatus(pod *kcore.Pod) PodStatus { case kcore.PodSucceeded: return PodStatusSucceeded case kcore.PodFailed: + if pod.Status.Reason == ReasonEvicted && evictedMemoryMessageRegex.MatchString(pod.Status.Message) { + return PodStatusKilledOOM + } + for _, containerStatus := range pod.Status.ContainerStatuses { if containerStatus.LastTerminationState.Terminated != nil { exitCode := containerStatus.LastTerminationState.Terminated.ExitCode diff --git a/pkg/operator/workloads/cron.go b/pkg/operator/workloads/cron.go index acd2705d3e..051a304ea5 100644 --- a/pkg/operator/workloads/cron.go +++ b/pkg/operator/workloads/cron.go @@ -19,9 +19,12 @@ package workloads import ( "time" + kcore "k8s.io/api/core/v1" kmeta "k8s.io/apimachinery/pkg/apis/meta/v1" "github.com/cortexlabs/cortex/pkg/lib/errors" + "github.com/cortexlabs/cortex/pkg/lib/k8s" + "github.com/cortexlabs/cortex/pkg/lib/sets/strset" "github.com/cortexlabs/cortex/pkg/operator/config" ) @@ -60,6 +63,7 @@ func runCron() { "workloadType": workloadTypeAPI, "userFacing": "true", }) + if err != nil { config.Telemetry.ReportError(err) errors.PrintError(err) @@ -73,11 +77,14 @@ func runCron() { failedPods, err := config.Kubernetes.ListPods(&kmeta.ListOptions{ FieldSelector: "status.phase=Failed", }) + if err != nil { config.Telemetry.ReportError(err) errors.PrintError(err) } + deleteEvictedPods(failedPods) + if err := updateDataWorkloadErrors(failedPods); err != nil { config.Telemetry.ReportError(err) errors.PrintError(err) @@ -93,3 +100,33 @@ func reportAndRecover(strs ...string) error { } return nil } + +func deleteEvictedPods(failedPods []kcore.Pod) { + evictedPods := []kcore.Pod{} + for _, pod := range failedPods { + if pod.Status.Reason == k8s.ReasonEvicted { + evictedPods = append(evictedPods, pod) + } + } + + if len(evictedPods) > 0 { + savedEvictedPods := map[string]kcore.Pod{} + currentWorkloadIDs := strset.New() + for _, ctx := range CurrentContexts() { + currentWorkloadIDs.Merge(ctx.ComputedResourceWorkloadIDs()) + } + + for _, pod := range evictedPods { + if currentWorkloadIDs.Has(pod.Labels["workloadID"]) { + if _, ok := savedEvictedPods[pod.Labels["resourceID"]]; !ok { + savedEvictedPods[pod.Labels["resourceID"]] = pod + continue + } + } + _, err := config.Kubernetes.DeletePod(pod.Name) + if err != nil { + errors.PrintError(err) + } + } + } +}