From d0108e7ea63e3177e140708d2ec05471e8617692 Mon Sep 17 00:00:00 2001 From: Ivan Zhang Date: Fri, 22 Mar 2019 09:58:29 -0400 Subject: [PATCH 1/4] OOM status --- pkg/api/resource/saved_status.go | 1 + pkg/api/resource/status.go | 14 +++++++++++--- pkg/operator/k8s/pod.go | 7 +++++++ pkg/operator/workloads/data_status.go | 2 ++ pkg/operator/workloads/workload_spec.go | 8 ++++++-- 5 files changed, 27 insertions(+), 5 deletions(-) diff --git a/pkg/api/resource/saved_status.go b/pkg/api/resource/saved_status.go index b39ccc97c9..34c4d4afb1 100644 --- a/pkg/api/resource/saved_status.go +++ b/pkg/api/resource/saved_status.go @@ -47,6 +47,7 @@ const ( ExitCodeDataSucceeded DataExitCode = "succeeded" ExitCodeDataFailed DataExitCode = "failed" ExitCodeDataKilled DataExitCode = "killed" + ExitCodeDataOOM DataExitCode = "oom" ) func DataSavedStatusPtrsEqual(savedStatus *DataSavedStatus, savedStatus2 *DataSavedStatus) bool { diff --git a/pkg/api/resource/status.go b/pkg/api/resource/status.go index 584a4fb47c..ae9aaa3e80 100644 --- a/pkg/api/resource/status.go +++ b/pkg/api/resource/status.go @@ -109,6 +109,8 @@ const ( StatusAPIGroupParentFailed StatusAPIGroupParentKilled StatusAPIGroupUpdateSkipped + + StatusDataOOM ) var statusCodes = []string{ @@ -136,9 +138,11 @@ var statusCodes = []string{ "status_api_group_parent_failed", "status_api_group_parent_killed", "status_api_group_update_skipped", + + "status_data_oom", } -var _ = [1]int{}[int(StatusAPIGroupUpdateSkipped)-(len(statusCodes)-1)] // Ensure list length matches +var _ = [1]int{}[int(StatusDataOOM)-(len(statusCodes)-1)] // Ensure list length matches var statusCodeMessages = []string{ "unknown", // StatusUnknown @@ -165,9 +169,11 @@ var statusCodeMessages = []string{ "upstream error", // StatusAPIGroupParentFailed "upstream termination", // StatusAPIGroupParentKilled "update skipped", // StatusAPIGroupUpdateSkipped + + "killed (oom)", // StatusDataOOM } -var _ = 
[1]int{}[int(StatusAPIGroupUpdateSkipped)-(len(statusCodeMessages)-1)] // Ensure list length matches +var _ = [1]int{}[int(StatusDataOOM)-(len(statusCodeMessages)-1)] // Ensure list length matches // StatusDataRunning aliases const ( @@ -203,9 +209,11 @@ var statusSortBuckets = []int{ 2, // StatusAPIGroupParentFailed 2, // StatusAPIGroupParentKilled 2, // StatusAPIGroupUpdateSkipped + + 1, // StatusDataOOM } -var _ = [1]int{}[int(StatusAPIGroupUpdateSkipped)-(len(statusSortBuckets)-1)] // Ensure list length matches +var _ = [1]int{}[int(StatusDataOOM)-(len(statusSortBuckets)-1)] // Ensure list length matches func (code StatusCode) String() string { if int(code) < 0 || int(code) >= len(statusCodes) { diff --git a/pkg/operator/k8s/pod.go b/pkg/operator/k8s/pod.go index 42ef326967..e58abfcbf9 100644 --- a/pkg/operator/k8s/pod.go +++ b/pkg/operator/k8s/pod.go @@ -40,6 +40,7 @@ const ( PodStatusFailed = "Failed" PodStatusKilled = "Killed" PodStatusUnknown = "Unknown" + PodStatusOOM = "Out of Memory" ) var killStatuses = map[int32]bool{ @@ -108,11 +109,17 @@ func GetPodStatus(pod *corev1.Pod) string { for _, containerStatus := range pod.Status.ContainerStatuses { if containerStatus.LastTerminationState.Terminated != nil { exitCode := containerStatus.LastTerminationState.Terminated.ExitCode + if exitCode == 137 { + return PodStatusOOM + } if killStatuses[exitCode] { return PodStatusKilled } } else if containerStatus.State.Terminated != nil { exitCode := containerStatus.State.Terminated.ExitCode + if exitCode == 137 { + return PodStatusOOM + } if killStatuses[exitCode] { return PodStatusKilled } diff --git a/pkg/operator/workloads/data_status.go b/pkg/operator/workloads/data_status.go index b80241418f..ec5310bb24 100644 --- a/pkg/operator/workloads/data_status.go +++ b/pkg/operator/workloads/data_status.go @@ -77,6 +77,8 @@ func dataStatusCode(dataSavedStatus *resource.DataSavedStatus) resource.StatusCo return resource.StatusDataFailed case resource.ExitCodeDataKilled: 
return resource.StatusDataKilled + case resource.ExitCodeDataOOM: + return resource.StatusDataOOM } return resource.StatusUnknown diff --git a/pkg/operator/workloads/workload_spec.go b/pkg/operator/workloads/workload_spec.go index ac1c717ef9..7e390e47ff 100644 --- a/pkg/operator/workloads/workload_spec.go +++ b/pkg/operator/workloads/workload_spec.go @@ -150,9 +150,13 @@ func UpdateDataWorkflowErrors(failedPods []corev1.Pod) error { savedStatus.Start = nowTime } - savedStatus.ExitCode = resource.ExitCodeDataFailed - if k8s.GetPodStatus(&pod) == k8s.PodStatusKilled { + switch k8s.GetPodStatus(&pod) { + case k8s.PodStatusKilled: savedStatus.ExitCode = resource.ExitCodeDataKilled + case k8s.PodStatusOOM: + savedStatus.ExitCode = resource.ExitCodeDataOOM + default: + savedStatus.ExitCode = resource.ExitCodeDataFailed } savedStatusesToUpload = append(savedStatusesToUpload, savedStatus) From 9ac91477ed62ed11bb85870907dac30dddb83462 Mon Sep 17 00:00:00 2001 From: Ivan Zhang Date: Fri, 22 Mar 2019 12:59:45 -0400 Subject: [PATCH 2/4] address comments --- pkg/api/resource/status.go | 12 ++++++------ pkg/operator/k8s/pod.go | 6 +++--- pkg/operator/workloads/api_status.go | 2 +- pkg/operator/workloads/data_status.go | 2 +- pkg/operator/workloads/workload_spec.go | 2 +- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/pkg/api/resource/status.go b/pkg/api/resource/status.go index ae9aaa3e80..f400246710 100644 --- a/pkg/api/resource/status.go +++ b/pkg/api/resource/status.go @@ -110,7 +110,7 @@ const ( StatusAPIGroupParentKilled StatusAPIGroupUpdateSkipped - StatusDataOOM + StatusDataKilledOOM ) var statusCodes = []string{ @@ -142,7 +142,7 @@ var statusCodes = []string{ "status_data_oom", } -var _ = [1]int{}[int(StatusDataOOM)-(len(statusCodes)-1)] // Ensure list length matches +var _ = [1]int{}[int(StatusDataKilledOOM)-(len(statusCodes)-1)] // Ensure list length matches var statusCodeMessages = []string{ "unknown", // StatusUnknown @@ -170,10 +170,10 @@ var 
statusCodeMessages = []string{ "upstream termination", // StatusAPIGroupParentKilled "update skipped", // StatusAPIGroupUpdateSkipped - "killed (oom)", // StatusDataOOM + "terminated (out of mem)", // StatusDataKilledOOM } -var _ = [1]int{}[int(StatusDataOOM)-(len(statusCodeMessages)-1)] // Ensure list length matches +var _ = [1]int{}[int(StatusDataKilledOOM)-(len(statusCodeMessages)-1)] // Ensure list length matches // StatusDataRunning aliases const ( @@ -210,10 +210,10 @@ var statusSortBuckets = []int{ 2, // StatusAPIGroupParentKilled 2, // StatusAPIGroupUpdateSkipped - 1, // StatusDataOOM + 1, // StatusDataKilledOOM } -var _ = [1]int{}[int(StatusDataOOM)-(len(statusSortBuckets)-1)] // Ensure list length matches +var _ = [1]int{}[int(StatusDataKilledOOM)-(len(statusSortBuckets)-1)] // Ensure list length matches func (code StatusCode) String() string { if int(code) < 0 || int(code) >= len(statusCodes) { diff --git a/pkg/operator/k8s/pod.go b/pkg/operator/k8s/pod.go index e58abfcbf9..72f87b48c1 100644 --- a/pkg/operator/k8s/pod.go +++ b/pkg/operator/k8s/pod.go @@ -40,7 +40,7 @@ const ( PodStatusFailed = "Failed" PodStatusKilled = "Killed" PodStatusUnknown = "Unknown" - PodStatusOOM = "Out of Memory" + PodStatusKilledOOM = "Out of Memory" ) var killStatuses = map[int32]bool{ @@ -110,7 +110,7 @@ func GetPodStatus(pod *corev1.Pod) string { if containerStatus.LastTerminationState.Terminated != nil { exitCode := containerStatus.LastTerminationState.Terminated.ExitCode if exitCode == 137 { - return PodStatusOOM + return PodStatusKilledOOM } if killStatuses[exitCode] { return PodStatusKilled @@ -118,7 +118,7 @@ func GetPodStatus(pod *corev1.Pod) string { } else if containerStatus.State.Terminated != nil { exitCode := containerStatus.State.Terminated.ExitCode if exitCode == 137 { - return PodStatusOOM + return PodStatusKilledOOM } if killStatuses[exitCode] { return PodStatusKilled diff --git a/pkg/operator/workloads/api_status.go b/pkg/operator/workloads/api_status.go index 
05a50a1fe8..32245ec715 100644 --- a/pkg/operator/workloads/api_status.go +++ b/pkg/operator/workloads/api_status.go @@ -194,7 +194,7 @@ func updateAPIStatusCodeByParents(apiStatus *resource.APIStatus, dataStatuses ma parentSkipped := false for dependency := range allDependencies { switch dataStatuses[dependency].Code { - case resource.StatusDataKilled: + case resource.StatusDataKilled, resource.StatusDataKilledOOM: apiStatus.Code = resource.StatusParentKilled return case resource.StatusDataFailed: diff --git a/pkg/operator/workloads/data_status.go b/pkg/operator/workloads/data_status.go index ec5310bb24..1d70b41102 100644 --- a/pkg/operator/workloads/data_status.go +++ b/pkg/operator/workloads/data_status.go @@ -78,7 +78,7 @@ func dataStatusCode(dataSavedStatus *resource.DataSavedStatus) resource.StatusCo case resource.ExitCodeDataKilled: return resource.StatusDataKilled case resource.ExitCodeDataOOM: - return resource.StatusDataOOM + return resource.StatusDataKilledOOM } return resource.StatusUnknown diff --git a/pkg/operator/workloads/workload_spec.go b/pkg/operator/workloads/workload_spec.go index 7e390e47ff..3176b9e348 100644 --- a/pkg/operator/workloads/workload_spec.go +++ b/pkg/operator/workloads/workload_spec.go @@ -153,7 +153,7 @@ func UpdateDataWorkflowErrors(failedPods []corev1.Pod) error { switch k8s.GetPodStatus(&pod) { case k8s.PodStatusKilled: savedStatus.ExitCode = resource.ExitCodeDataKilled - case k8s.PodStatusOOM: + case k8s.PodStatusKilledOOM: savedStatus.ExitCode = resource.ExitCodeDataOOM default: savedStatus.ExitCode = resource.ExitCodeDataFailed From a6804aef53996eb0b500313ebb1492b07578ac04 Mon Sep 17 00:00:00 2001 From: Ivan Zhang Date: Fri, 22 Mar 2019 16:29:08 -0400 Subject: [PATCH 3/4] add parent terminated status to data --- pkg/operator/workloads/data_status.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/operator/workloads/data_status.go b/pkg/operator/workloads/data_status.go index 1d70b41102..930c03d483 
100644 --- a/pkg/operator/workloads/data_status.go +++ b/pkg/operator/workloads/data_status.go @@ -93,7 +93,7 @@ func updateDataStatusCodeByParents(dataStatus *resource.DataStatus, dataStatuses parentSkipped := false for dependency := range allDependencies { switch dataStatuses[dependency].Code { - case resource.StatusDataKilled: + case resource.StatusDataKilled, resource.StatusDataKilledOOM: dataStatus.Code = resource.StatusParentKilled return case resource.StatusDataFailed: From 2fffa54d16169b795ed61826566a3ed0acc247ad Mon Sep 17 00:00:00 2001 From: Ivan Zhang Date: Fri, 22 Mar 2019 16:33:59 -0400 Subject: [PATCH 4/4] move unknown up --- pkg/operator/k8s/pod.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/operator/k8s/pod.go b/pkg/operator/k8s/pod.go index 72f87b48c1..8762534add 100644 --- a/pkg/operator/k8s/pod.go +++ b/pkg/operator/k8s/pod.go @@ -33,13 +33,13 @@ var podTypeMeta = metav1.TypeMeta{ } const ( + PodStatusUnknown = "Unknown" PodStatusPending = "Pending" PodStatusRunning = "Running" PodStatusTerminating = "Terminating" PodStatusSucceeded = "Succeeded" PodStatusFailed = "Failed" PodStatusKilled = "Killed" - PodStatusUnknown = "Unknown" PodStatusKilledOOM = "Out of Memory" )