Skip to content

Evict pods that consume too much memory #426

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Sep 5, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions manager/eks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,16 @@ nodeGroups:
iam:
withAddonPolicies:
autoScaler: true
kubeletExtraConfig:
kubeReserved:
cpu: 150m
memory: 300Mi
ephemeral-storage: 1Gi
kubeReservedCgroup: /kube-reserved
systemReserved:
cpu: 150m
memory: 300Mi
ephemeral-storage: 1Gi
evictionHard:
memory.available: 200Mi
nodefs.available: 5%
9 changes: 9 additions & 0 deletions pkg/lib/k8s/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ limitations under the License.
package k8s

import (
"regexp"
"time"

kcore "k8s.io/api/core/v1"
Expand All @@ -32,6 +33,8 @@ var podTypeMeta = kmeta.TypeMeta{
Kind: "Pod",
}

// ReasonEvicted is the pod Status.Reason value that is compared against to
// recognize a pod that was evicted.
const ReasonEvicted = "Evicted"

type PodStatus string

const (
Expand Down Expand Up @@ -130,6 +133,8 @@ func GetPodReadyTime(pod *kcore.Pod) *time.Time {
return nil
}

// evictedMemoryMessageRegex matches, case-insensitively, the words "low on
// resource memory" with one or more non-word characters (spaces, punctuation)
// between each word, e.g. "low on resource: memory". GetPodStatus applies it to
// the status message of an evicted pod to decide whether to report the pod as
// PodStatusKilledOOM.
var evictedMemoryMessageRegex = regexp.MustCompile(`(?i)low\W+on\W+resource\W+memory`)

func GetPodStatus(pod *kcore.Pod) PodStatus {
if pod == nil {
return PodStatusUnknown
Expand All @@ -145,6 +150,10 @@ func GetPodStatus(pod *kcore.Pod) PodStatus {
case kcore.PodSucceeded:
return PodStatusSucceeded
case kcore.PodFailed:
if pod.Status.Reason == ReasonEvicted && evictedMemoryMessageRegex.MatchString(pod.Status.Message) {
return PodStatusKilledOOM
}

for _, containerStatus := range pod.Status.ContainerStatuses {
if containerStatus.LastTerminationState.Terminated != nil {
exitCode := containerStatus.LastTerminationState.Terminated.ExitCode
Expand Down
37 changes: 37 additions & 0 deletions pkg/operator/workloads/cron.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,12 @@ package workloads
import (
"time"

kcore "k8s.io/api/core/v1"
kmeta "k8s.io/apimachinery/pkg/apis/meta/v1"

"github.com/cortexlabs/cortex/pkg/lib/errors"
"github.com/cortexlabs/cortex/pkg/lib/k8s"
"github.com/cortexlabs/cortex/pkg/lib/sets/strset"
"github.com/cortexlabs/cortex/pkg/operator/config"
)

Expand Down Expand Up @@ -60,6 +63,7 @@ func runCron() {
"workloadType": workloadTypeAPI,
"userFacing": "true",
})

if err != nil {
config.Telemetry.ReportError(err)
errors.PrintError(err)
Expand All @@ -73,11 +77,14 @@ func runCron() {
failedPods, err := config.Kubernetes.ListPods(&kmeta.ListOptions{
FieldSelector: "status.phase=Failed",
})

if err != nil {
config.Telemetry.ReportError(err)
errors.PrintError(err)
}

deleteEvictedPods(failedPods)

if err := updateDataWorkloadErrors(failedPods); err != nil {
config.Telemetry.ReportError(err)
errors.PrintError(err)
Expand All @@ -93,3 +100,33 @@ func reportAndRecover(strs ...string) error {
}
return nil
}

// deleteEvictedPods removes evicted pods found in failedPods.
//
// Only pods whose Status.Reason equals k8s.ReasonEvicted are considered. Among
// those, the first pod encountered for each "resourceID" label is kept, provided
// its "workloadID" label belongs to a workload ID computed from the current
// contexts; every other evicted pod is deleted. Deletion errors are printed and
// do not stop the remaining deletions.
//
// NOTE(review): keeping one pod per resource presumably lets its eviction still
// be surfaced in status reporting — confirm against the callers.
func deleteEvictedPods(failedPods []kcore.Pod) {
	var evicted []kcore.Pod
	for i := range failedPods {
		if failedPods[i].Status.Reason != k8s.ReasonEvicted {
			continue
		}
		evicted = append(evicted, failedPods[i])
	}

	// Nothing was evicted, so there is no need to look up the current contexts.
	if len(evicted) == 0 {
		return
	}

	activeWorkloadIDs := strset.New()
	for _, ctx := range CurrentContexts() {
		activeWorkloadIDs.Merge(ctx.ComputedResourceWorkloadIDs())
	}

	keptByResourceID := map[string]kcore.Pod{}
	for _, evictedPod := range evicted {
		resourceID := evictedPod.Labels["resourceID"]
		_, alreadyKept := keptByResourceID[resourceID]

		// Spare the first evicted pod seen for each resource of a current workload.
		if activeWorkloadIDs.Has(evictedPod.Labels["workloadID"]) && !alreadyKept {
			keptByResourceID[resourceID] = evictedPod
			continue
		}

		if _, err := config.Kubernetes.DeletePod(evictedPod.Name); err != nil {
			errors.PrintError(err)
		}
	}
}