From b5321f7d8b9c999fad89f60b745402e4a2067b63 Mon Sep 17 00:00:00 2001 From: Pranshu Srivastava Date: Fri, 10 Oct 2025 03:21:29 +0530 Subject: [PATCH 1/6] OCPBUGS-62703: Relax duplicate events detection for Prometheus Overrides the duplicate readiness error events' limit for Prometheus during upgrades. Since Prometheus needs some time to wind down (see [1]), it was causing Kubelet to exhibit readiness error events during the time span it took to terminate. This ignores those pings to a limit (100). [1]: https://github.com/prometheus-operator/prometheus-operator/blob/d0ae00fdedc656a5a1a290d9839b84d860f15428/pkg/prometheus/common.go#L56-L59 Signed-off-by: Pranshu Srivastava --- .../duplicated_event_patterns.go | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go b/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go index 52e54b0ca82b..a95a79aa27a3 100644 --- a/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go +++ b/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go @@ -502,6 +502,9 @@ func NewUniversalPathologicalEventMatchers(kubeConfig *rest.Config, finalInterva twoNodeEtcdEndpointsMatcher := newTwoNodeEtcdEndpointsConfigMissingEventMatcher(finalIntervals) registry.AddPathologicalEventMatcherOrDie(twoNodeEtcdEndpointsMatcher) + prometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher := newPrometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher(finalIntervals) + registry.AddPathologicalEventMatcherOrDie(prometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher) + return registry } @@ -1171,3 +1174,58 @@ func newCrioReloadedTooOftenEventMatcher(finalInternals monitorapi.Intervals) Ev allowIfWithinIntervals: crioReloadedIntervals, } } + +func newPrometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher(finalIntervals monitorapi.Intervals) EventMatcher { + statefulSetName := "prometheus-k8s" + statefulSetNamespace := "openshift-monitoring" + messageHumanizedSubstring := "Readiness probe errored: rpc error" + messageReason := "Unhealthy" + matcher := &SimplePathologicalEventMatcher{ + name: "PrometheusReadinessProbeErrorsDuringUpgrades", + locatorKeyRegexes: map[monitorapi.LocatorKey]*regexp.Regexp{ + monitorapi.LocatorNamespaceKey: regexp.MustCompile(`^` + statefulSetNamespace + `$`), + monitorapi.LocatorStatefulSetKey: regexp.MustCompile(`^` + statefulSetName + `$`), + }, + messageReasonRegex: regexp.MustCompile(`^` + messageReason + `$`), + messageHumanRegex: regexp.MustCompile(messageHumanizedSubstring), + jira: "https://issues.redhat.com/browse/OCPBUGS-62703", + } + + // Sanity check in case no `finalIntervals` are provided. 
+ if finalIntervals == nil || len(finalIntervals) == 0 { + matcher.neverAllow = true + return matcher + } + + /* + 05:50:32 openshift-monitoring kubelet prometheus-k8s-1 + Unhealthy + Readiness probe errored: rpc error: code = NotFound desc = container is not created or running: checking if PID of 58577e7deb7b8ae87b8029b9988fa268613748d0743ce989748f27e52b199ef5 is running failed: container process not found + + 05:53:52 (x25) openshift-monitoring kubelet prometheus-k8s-0 + Unhealthy + Readiness probe errored: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 + */ + testIntervals := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool { + return eventInterval.Locator.Type == monitorapi.LocatorTypeStatefulSet && + eventInterval.Locator.Keys[monitorapi.LocatorNamespaceKey] == statefulSetNamespace && + eventInterval.Locator.Keys[monitorapi.LocatorStatefulSetKey] == statefulSetName && + eventInterval.Message.Reason == monitorapi.IntervalReason(messageReason) && + strings.Contains(eventInterval.Message.HumanMessage, messageHumanizedSubstring) + }) + + if len(testIntervals) > 0 { + // Readiness probe errors are expected during upgrades, allow a higher threshold. + // Set the threshold to 100 to allow for a high number of readiness probe errors + // during the upgrade, but not so high that we would miss a real problem, i.e., + // the job below (and usually) hit ~60 readiness errors during the upgrade, + // https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.20-upgrade-from-stable-4.19-e2e-aws-ovn-upgrade/1977094149035266048, + // However, the job below hit readiness errors 774 times during the upgrade, + // https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.19-upgrade-from-stable-4.18-e2e-metal-ovn-single-node-rt-upgrade-test/1975691393640697856. + matcher.repeatThresholdOverride = 100 + } else { + matcher.neverAllow = true + } + + return matcher +} From 0d1ade8b885782a6c6c78e8284ad9b024be165d7 Mon Sep 17 00:00:00 2001 From: Pranshu Srivastava Date: Wed, 29 Oct 2025 00:34:39 +0530 Subject: [PATCH 2/6] fixup! 
OCPBUGS-62703: Relax duplicate events detection for Prometheus --- pkg/monitor/monitorapi/types.go | 2 ++ .../duplicated_event_patterns.go | 26 +++++++++++-------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/pkg/monitor/monitorapi/types.go b/pkg/monitor/monitorapi/types.go index bf68bcd45a5a..185a36e76944 100644 --- a/pkg/monitor/monitorapi/types.go +++ b/pkg/monitor/monitorapi/types.go @@ -251,6 +251,8 @@ const ( FailedToAuthenticateWithOpenShiftUser IntervalReason = "FailedToAuthenticateWithOpenShiftUser" FailedContactingAPIReason IntervalReason = "FailedContactingAPI" + UnhealthyReason IntervalReason = "Unhealthy" + UpgradeStartedReason IntervalReason = "UpgradeStarted" UpgradeVersionReason IntervalReason = "UpgradeVersion" UpgradeRollbackReason IntervalReason = "UpgradeRollback" diff --git a/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go b/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go index a95a79aa27a3..c6a6ef58417f 100644 --- a/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go +++ b/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go @@ -1176,17 +1176,17 @@ func newCrioReloadedTooOftenEventMatcher(finalInternals monitorapi.Intervals) Ev } func newPrometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher(finalIntervals monitorapi.Intervals) EventMatcher { - statefulSetName := "prometheus-k8s" - statefulSetNamespace := "openshift-monitoring" - messageHumanizedSubstring := "Readiness probe errored: rpc error" - messageReason := "Unhealthy" + podNamePrefix := "prometheus-k8s" + podNamespace := "openshift-monitoring" + messageHumanizedSubstring := "Readiness probe errored" + messageReason := monitorapi.UnhealthyReason matcher := &SimplePathologicalEventMatcher{ name: "PrometheusReadinessProbeErrorsDuringUpgrades", locatorKeyRegexes: map[monitorapi.LocatorKey]*regexp.Regexp{ - monitorapi.LocatorNamespaceKey: regexp.MustCompile(`^` + statefulSetNamespace + `$`), - monitorapi.LocatorStatefulSetKey: regexp.MustCompile(`^` + statefulSetName + `$`), + monitorapi.LocatorNamespaceKey: regexp.MustCompile(`^` + podNamespace + `$`), + monitorapi.LocatorPodKey: regexp.MustCompile(`^` + podNamePrefix + `-[0,1]$`), }, - messageReasonRegex: regexp.MustCompile(`^` + messageReason + `$`), + messageReasonRegex: regexp.MustCompile(`^` + string(messageReason) + `$`), messageHumanRegex: regexp.MustCompile(messageHumanizedSubstring), jira: "https://issues.redhat.com/browse/OCPBUGS-62703", } @@ -1205,12 +1205,16 @@ func newPrometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher(fin 05:53:52 (x25) openshift-monitoring kubelet prometheus-k8s-0 Unhealthy Readiness probe errored: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 + + 11:44:16 (x56) openshift-monitoring kubelet prometheus-k8s-0 + Unhealthy + Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 */ testIntervals := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool { - return eventInterval.Locator.Type == monitorapi.LocatorTypeStatefulSet && - eventInterval.Locator.Keys[monitorapi.LocatorNamespaceKey] == statefulSetNamespace && - eventInterval.Locator.Keys[monitorapi.LocatorStatefulSetKey] == statefulSetName && - eventInterval.Message.Reason == 
monitorapi.IntervalReason(messageReason) && + return eventInterval.Locator.Type == monitorapi.LocatorTypePod && + eventInterval.Locator.Keys[monitorapi.LocatorNamespaceKey] == podNamespace && + strings.HasPrefix(eventInterval.Locator.Keys[monitorapi.LocatorPodKey], podNamePrefix) && + eventInterval.Message.Reason == messageReason && strings.Contains(eventInterval.Message.HumanMessage, messageHumanizedSubstring) }) From 785a37c77c174581a578ec880643a7dc7513447d Mon Sep 17 00:00:00 2001 From: Pranshu Srivastava Date: Thu, 30 Oct 2025 12:51:49 +0530 Subject: [PATCH 3/6] fixup! fixup! OCPBUGS-62703: Relax duplicate events detection for Prometheus --- .../duplicated_event_patterns.go | 18 +-- .../duplicated_events_test.go | 115 ++++++++++++++++++ 2 files changed, 126 insertions(+), 7 deletions(-) diff --git a/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go b/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go index c6a6ef58417f..3eb016b2ed16 100644 --- a/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go +++ b/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go @@ -1219,13 +1219,17 @@ func newPrometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher(fin }) if len(testIntervals) > 0 { - // Readiness probe errors are expected during upgrades, allow a higher threshold. - // Set the threshold to 100 to allow for a high number of readiness probe errors - // during the upgrade, but not so high that we would miss a real problem, i.e., - // the job below (and usually) hit ~60 readiness errors during the upgrade, - // https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.20-upgrade-from-stable-4.19-e2e-aws-ovn-upgrade/1977094149035266048, - // However, the job below hit readiness errors 774 times during the upgrade, - // https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.19-upgrade-from-stable-4.18-e2e-metal-ovn-single-node-rt-upgrade-test/1975691393640697856. + /* + Readiness probes run during the lifecycle of the container, including termination. + Prometheus pods may take some time to stop, and thus result in more kubelet pings than permitted by default (20). + With a termination grace period of 600s, these pods may lead to probe errors (e.g. the web service is stopped but the process is still running), which is expected during upgrades. + + To address this, set the threshold to 100 (approximately 600 (termination period) / 5 (probe interval)), to allow for a high number of readiness probe errors during the upgrade, but not so high that we would miss a real problem. + The job below hit ~60 readiness errors during the upgrade: + https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.20-upgrade-from-stable-4.19-e2e-aws-ovn-upgrade/1977094149035266048, which makes sense to ignore, + However, the job below hit readiness errors 774 times during the upgrade: + https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.19-upgrade-from-stable-4.18-e2e-metal-ovn-single-node-rt-upgrade-test/1975691393640697856, which should be caught. 
+ */ matcher.repeatThresholdOverride = 100 } else { matcher.neverAllow = true diff --git a/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events_test.go b/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events_test.go index 25bd2d5ce41e..30f9b18f2900 100644 --- a/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events_test.go +++ b/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events_test.go @@ -2,6 +2,7 @@ package pathologicaleventlibrary import ( _ "embed" + "fmt" "testing" "time" @@ -666,3 +667,117 @@ func TestPathologicalEventsTopologyAwareHintsDisabled(t *testing.T) { }) } } + +func TestPathologicalEventsPrometheusReadinessProbeErrorsDuringUpgrades(t *testing.T) { + const namespace = "openshift-monitoring" + + unhealthyReasonPathologicalMessageWithHumanMessage := func(humanMessage string, repetitionCount int) monitorapi.Message { + return monitorapi.Message{ + Reason: monitorapi.UnhealthyReason, + HumanMessage: humanMessage, + Annotations: map[monitorapi.AnnotationKey]string{ + monitorapi.AnnotationCount: fmt.Sprintf("%d", repetitionCount), + monitorapi.AnnotationPathological: "true", + }, + } + } + + openshiftMonitoringNsLocatorWithPodKey := func(pod string) monitorapi.Locator { + return monitorapi.Locator{ + Type: monitorapi.LocatorTypePod, + Keys: map[monitorapi.LocatorKey]string{ + monitorapi.LocatorNamespaceKey: "openshift-monitoring", + monitorapi.LocatorPodKey: pod, + }, + } + } + + tests := []struct { + name string + intervals []monitorapi.Interval + expectedMessage string + }{ + { + name: "Readiness probe error (stopping container) on first Prometheus pod", + intervals: []monitorapi.Interval{ + { + Condition: monitorapi.Condition{ + Locator: openshiftMonitoringNsLocatorWithPodKey("prometheus-k8s-0"), + Message: unhealthyReasonPathologicalMessageWithHumanMessage("Readiness probe errored: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1", 100), + }, + }, + }, + }, + { + name: "Readiness probe error (terminated container) on second Prometheus pod", + intervals: []monitorapi.Interval{ + { + Condition: monitorapi.Condition{ + Locator: openshiftMonitoringNsLocatorWithPodKey("prometheus-k8s-1"), + Message: unhealthyReasonPathologicalMessageWithHumanMessage("Readiness probe errored: rpc error: code = NotFound desc = container is not created or running: checking if PID of 58577e7deb7b8ae87b8029b9988fa268613748d0743ce989748f27e52b199ef5 is running failed: container process not found", 100), + }, + }, + }, + }, + { + name: "Readiness probe error (stopping container, different human message) on second Prometheus pod", + intervals: []monitorapi.Interval{ + { + Condition: monitorapi.Condition{ + Locator: openshiftMonitoringNsLocatorWithPodKey("prometheus-k8s-1"), + Message: unhealthyReasonPathologicalMessageWithHumanMessage("Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1", 100), + }, + }, + }, + }, + { + name: "Readiness probe error (stopping container, different human message) on non-existent Prometheus pod should not be ignored", + intervals: []monitorapi.Interval{ + { + Condition: monitorapi.Condition{ + Locator: openshiftMonitoringNsLocatorWithPodKey("prometheus-k8s-2"), + Message: unhealthyReasonPathologicalMessageWithHumanMessage("Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = 
command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1", 100), + }, + }, + }, + expectedMessage: "1 events happened too frequently\n\nevent happened 100 times, something is wrong: namespace/openshift-monitoring pod/prometheus-k8s-2 - reason/Unhealthy Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 (00:00:00Z) result=reject ", + }, + { + name: "Readiness probe error (stopping container, different human message) on a Prometheus pod should not be ignored above the acceptable limit", + intervals: []monitorapi.Interval{ + { + Condition: monitorapi.Condition{ + Locator: openshiftMonitoringNsLocatorWithPodKey("prometheus-k8s-1"), + Message: unhealthyReasonPathologicalMessageWithHumanMessage("Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1", 101), + }, + }, + }, + expectedMessage: "1 events happened too frequently\n\nevent happened 101 times, something is wrong: namespace/openshift-monitoring pod/prometheus-k8s-1 - reason/Unhealthy Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 (00:00:00Z) result=reject ", + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + events := monitorapi.Intervals(test.intervals) + evaluator := duplicateEventsEvaluator{ + registry: NewUpgradePathologicalEventMatchers(nil, events), + } + + testName := "events should not repeat" + junits := evaluator.testDuplicatedEvents(testName, false, events, nil, false) + jUnitName := getJUnitName(testName, namespace) + for _, junit := range junits { + if junit.Name == jUnitName { + if test.expectedMessage != "" { + require.NotNil(t, junit.FailureOutput, "expected junit to have failure output") + require.Equal(t, test.expectedMessage, junit.FailureOutput.Output) + } else { + require.Nil(t, junit.FailureOutput, "expected success but got failure output for junit: %s", junit.Name) + } + + break + } + } + }) + } +} From 8753df3c814950fb47e74392f9aa82292b503c17 Mon Sep 17 00:00:00 2001 From: Pranshu Srivastava Date: Tue, 4 Nov 2025 16:07:40 +0530 Subject: [PATCH 4/6] fixup! fixup! fixup! 
OCPBUGS-62703: Relax duplicate events detection for Prometheus --- .../duplicated_event_patterns.go | 101 +++++++----------- 1 file changed, 36 insertions(+), 65 deletions(-) diff --git a/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go b/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go index 3eb016b2ed16..44cbc2986231 100644 --- a/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go +++ b/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go @@ -502,8 +502,42 @@ func NewUniversalPathologicalEventMatchers(kubeConfig *rest.Config, finalInterva twoNodeEtcdEndpointsMatcher := newTwoNodeEtcdEndpointsConfigMissingEventMatcher(finalIntervals) registry.AddPathologicalEventMatcherOrDie(twoNodeEtcdEndpointsMatcher) - prometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher := newPrometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher(finalIntervals) - registry.AddPathologicalEventMatcherOrDie(prometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher) + registry.AddPathologicalEventMatcherOrDie(&SimplePathologicalEventMatcher{ + name: "PrometheusReadinessProbeErrors", + locatorKeyRegexes: map[monitorapi.LocatorKey]*regexp.Regexp{ + monitorapi.LocatorNamespaceKey: regexp.MustCompile(`^openshift-monitoring$`), + monitorapi.LocatorPodKey: regexp.MustCompile(`^prometheus-k8s-[0,1]$`), + }, + messageReasonRegex: regexp.MustCompile(`^` + string(monitorapi.UnhealthyReason) + `$`), + messageHumanRegex: regexp.MustCompile("Readiness probe errored"), + jira: "https://issues.redhat.com/browse/OCPBUGS-62703", + /* + 05:50:32 openshift-monitoring kubelet prometheus-k8s-1 + Unhealthy + Readiness probe errored: rpc error: code = NotFound desc = container is not created or running: checking if PID of 58577e7deb7b8ae87b8029b9988fa268613748d0743ce989748f27e52b199ef5 is running failed: container process not found + + 05:53:52 (x25) openshift-monitoring kubelet prometheus-k8s-0 + Unhealthy + Readiness probe errored: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 + + 11:44:16 (x56) openshift-monitoring kubelet prometheus-k8s-0 + Unhealthy + Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 + + Readiness probes run during the lifecycle of the container, including termination. + Prometheus pods may take some time to stop, and thus result in more kubelet pings than permitted by default (20). + With a termination grace period of 600s, these pods may lead to probe errors (e.g. the web service is stopped but the process is still running), which is expected during upgrades. + + To address this, set the threshold to 100 (approximately 600 (termination period) / 5 (probe interval)), to allow for a high number of readiness probe errors during the upgrade, but not so high that we would miss a real problem. 
+ The job below hit ~60 readiness errors during the upgrade: + https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.20-upgrade-from-stable-4.19-e2e-aws-ovn-upgrade/1977094149035266048, which makes sense to ignore, + However, the job below hit readiness errors 774 times during the upgrade: + https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.19-upgrade-from-stable-4.18-e2e-metal-ovn-single-node-rt-upgrade-test/1975691393640697856, which should be caught. + + Also, do note that these events were exhibited outside of upgrades as well, so we need to allow them in general. + */ + repeatThresholdOverride: 100, + }) return registry } @@ -1174,66 +1208,3 @@ func newCrioReloadedTooOftenEventMatcher(finalInternals monitorapi.Intervals) Ev allowIfWithinIntervals: crioReloadedIntervals, } } - -func newPrometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher(finalIntervals monitorapi.Intervals) EventMatcher { - podNamePrefix := "prometheus-k8s" - podNamespace := "openshift-monitoring" - messageHumanizedSubstring := "Readiness probe errored" - messageReason := monitorapi.UnhealthyReason - matcher := &SimplePathologicalEventMatcher{ - name: "PrometheusReadinessProbeErrorsDuringUpgrades", - locatorKeyRegexes: map[monitorapi.LocatorKey]*regexp.Regexp{ - monitorapi.LocatorNamespaceKey: regexp.MustCompile(`^` + podNamespace + `$`), - monitorapi.LocatorPodKey: regexp.MustCompile(`^` + podNamePrefix + `-[0,1]$`), - }, - messageReasonRegex: regexp.MustCompile(`^` + string(messageReason) + `$`), - messageHumanRegex: regexp.MustCompile(messageHumanizedSubstring), - jira: "https://issues.redhat.com/browse/OCPBUGS-62703", - } - - // Sanity check in case no `finalIntervals` are provided. - if finalIntervals == nil || len(finalIntervals) == 0 { - matcher.neverAllow = true - return matcher - } - - /* - 05:50:32 openshift-monitoring kubelet prometheus-k8s-1 - Unhealthy - Readiness probe errored: rpc error: code = NotFound desc = container is not created or running: checking if PID of 58577e7deb7b8ae87b8029b9988fa268613748d0743ce989748f27e52b199ef5 is running failed: container process not found - - 05:53:52 (x25) openshift-monitoring kubelet prometheus-k8s-0 - Unhealthy - Readiness probe errored: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 - - 11:44:16 (x56) openshift-monitoring kubelet prometheus-k8s-0 - Unhealthy - Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 - */ - testIntervals := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool { - return eventInterval.Locator.Type == monitorapi.LocatorTypePod && - eventInterval.Locator.Keys[monitorapi.LocatorNamespaceKey] == podNamespace && - strings.HasPrefix(eventInterval.Locator.Keys[monitorapi.LocatorPodKey], podNamePrefix) && - eventInterval.Message.Reason == messageReason && - strings.Contains(eventInterval.Message.HumanMessage, messageHumanizedSubstring) - }) - - if len(testIntervals) > 0 { - /* - Readiness probes run during the lifecycle of the container, including termination. - Prometheus pods may take some time to stop, and thus result in more kubelet pings than permitted by default (20). - With a termination grace period of 600s, these pods may lead to probe errors (e.g. 
the web service is stopped but the process is still running), which is expected during upgrades. - - To address this, set the threshold to 100 (approximately 600 (termination period) / 5 (probe interval)), to allow for a high number of readiness probe errors during the upgrade, but not so high that we would miss a real problem. - The job below hit ~60 readiness errors during the upgrade: - https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.20-upgrade-from-stable-4.19-e2e-aws-ovn-upgrade/1977094149035266048, which makes sense to ignore, - However, the job below hit readiness errors 774 times during the upgrade: - https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.19-upgrade-from-stable-4.18-e2e-metal-ovn-single-node-rt-upgrade-test/1975691393640697856, which should be caught. - */ - matcher.repeatThresholdOverride = 100 - } else { - matcher.neverAllow = true - } - - return matcher -} From ed24ec41ba177e7f5de6b0a04827b98fa600646f Mon Sep 17 00:00:00 2001 From: Pranshu Srivastava Date: Tue, 4 Nov 2025 17:43:27 +0530 Subject: [PATCH 5/6] fixup! fixup! fixup! fixup! OCPBUGS-62703: Relax duplicate events detection for Prometheus --- .../duplicated_events_test.go | 105 +++++++++--------- 1 file changed, 53 insertions(+), 52 deletions(-) diff --git a/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events_test.go b/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events_test.go index 30f9b18f2900..45d9a702d201 100644 --- a/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events_test.go +++ b/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events_test.go @@ -668,7 +668,7 @@ func TestPathologicalEventsTopologyAwareHintsDisabled(t *testing.T) { } } -func TestPathologicalEventsPrometheusReadinessProbeErrorsDuringUpgrades(t *testing.T) { +func TestPathologicalEventsPrometheusReadinessProbeErrors(t *testing.T) { const namespace = "openshift-monitoring" unhealthyReasonPathologicalMessageWithHumanMessage := func(humanMessage string, repetitionCount int) monitorapi.Message { @@ -682,11 +682,11 @@ func TestPathologicalEventsPrometheusReadinessProbeErrorsDuringUpgrades(t *testi } } - openshiftMonitoringNsLocatorWithPodKey := func(pod string) monitorapi.Locator { + nsLocatorWithPodKey := func(pod, ns string) monitorapi.Locator { return monitorapi.Locator{ Type: monitorapi.LocatorTypePod, Keys: map[monitorapi.LocatorKey]string{ - monitorapi.LocatorNamespaceKey: "openshift-monitoring", + monitorapi.LocatorNamespaceKey: ns, monitorapi.LocatorPodKey: pod, }, } @@ -694,78 +694,79 @@ func TestPathologicalEventsPrometheusReadinessProbeErrorsDuringUpgrades(t *testi tests := []struct { name string - intervals []monitorapi.Interval expectedMessage string + pod string + ns string + humanMessage string + repetitionCount int }{ { - name: "Readiness probe error (stopping container) on first Prometheus pod", - intervals: []monitorapi.Interval{ - { - Condition: monitorapi.Condition{ - Locator: openshiftMonitoringNsLocatorWithPodKey("prometheus-k8s-0"), - Message: unhealthyReasonPathologicalMessageWithHumanMessage("Readiness probe errored: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1", 100), - }, - }, - }, + name: "Readiness probe error (stopping container) on first Prometheus pod", + expectedMessage: "", + pod: "prometheus-k8s-0", + ns: namespace, + humanMessage: "Readiness probe errored: 
rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1", + repetitionCount: 100, }, { - name: "Readiness probe error (terminated container) on second Prometheus pod", - intervals: []monitorapi.Interval{ - { - Condition: monitorapi.Condition{ - Locator: openshiftMonitoringNsLocatorWithPodKey("prometheus-k8s-1"), - Message: unhealthyReasonPathologicalMessageWithHumanMessage("Readiness probe errored: rpc error: code = NotFound desc = container is not created or running: checking if PID of 58577e7deb7b8ae87b8029b9988fa268613748d0743ce989748f27e52b199ef5 is running failed: container process not found", 100), - }, - }, - }, + name: "Readiness probe error (terminated container) on second Prometheus pod", + expectedMessage: "", + pod: "prometheus-k8s-1", + ns: namespace, + humanMessage: "Readiness probe errored: rpc error: code = NotFound desc = container is not created or running: checking if PID of 58577e7deb7b8ae87b8029b9988fa268613748d0743ce989748f27e52b199ef5 is running failed: container process not found", + repetitionCount: 100, }, { - name: "Readiness probe error (stopping container, different human message) on second Prometheus pod", - intervals: []monitorapi.Interval{ - { - Condition: monitorapi.Condition{ - Locator: openshiftMonitoringNsLocatorWithPodKey("prometheus-k8s-1"), - Message: unhealthyReasonPathologicalMessageWithHumanMessage("Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1", 100), - }, - }, - }, + name: "Readiness probe error (stopping container, different human message) on second Prometheus pod", + expectedMessage: "", + pod: "prometheus-k8s-1", + ns: namespace, + humanMessage: "Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1", + repetitionCount: 100, }, { - name: "Readiness probe error (stopping container, different human message) on non-existent Prometheus pod should not be ignored", - intervals: []monitorapi.Interval{ - { - Condition: monitorapi.Condition{ - Locator: openshiftMonitoringNsLocatorWithPodKey("prometheus-k8s-2"), - Message: unhealthyReasonPathologicalMessageWithHumanMessage("Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1", 100), - }, - }, - }, + name: "Readiness probe error (stopping container) on a Prometheus pod in a different namespace should not be ignored", + expectedMessage: "1 events happened too frequently\n\nevent happened 100 times, something is wrong: namespace/foo pod/prometheus-k8s-1 - reason/Unhealthy Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 (00:00:00Z) result=reject ", + pod: "prometheus-k8s-1", + ns: "foo", + humanMessage: "Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1", + repetitionCount: 100, + }, + { + name: "Readiness probe error (stopping container) on non-existent Prometheus pod should not be ignored", expectedMessage: "1 events happened too frequently\n\nevent 
happened 100 times, something is wrong: namespace/openshift-monitoring pod/prometheus-k8s-2 - reason/Unhealthy Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 (00:00:00Z) result=reject ", + pod: "prometheus-k8s-2", + ns: namespace, + humanMessage: "Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1", + repetitionCount: 100, }, { - name: "Readiness probe error (stopping container, different human message) on a Prometheus pod should not be ignored above the acceptable limit", - intervals: []monitorapi.Interval{ - { - Condition: monitorapi.Condition{ - Locator: openshiftMonitoringNsLocatorWithPodKey("prometheus-k8s-1"), - Message: unhealthyReasonPathologicalMessageWithHumanMessage("Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1", 101), - }, - }, - }, + name: "Readiness probe error (stopping container, different human message) on a Prometheus pod should not be ignored above the acceptable limit", expectedMessage: "1 events happened too frequently\n\nevent happened 101 times, something is wrong: namespace/openshift-monitoring pod/prometheus-k8s-1 - reason/Unhealthy Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 (00:00:00Z) result=reject ", + pod: "prometheus-k8s-1", + ns: namespace, + humanMessage: "Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1", + repetitionCount: 101, }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { - events := monitorapi.Intervals(test.intervals) + events := monitorapi.Intervals([]monitorapi.Interval{ + { + Condition: monitorapi.Condition{ + Locator: nsLocatorWithPodKey(test.pod, test.ns), + Message: unhealthyReasonPathologicalMessageWithHumanMessage(test.humanMessage, test.repetitionCount), + }, + }, + }) evaluator := duplicateEventsEvaluator{ - registry: NewUpgradePathologicalEventMatchers(nil, events), + registry: NewUniversalPathologicalEventMatchers(nil, events), } testName := "events should not repeat" junits := evaluator.testDuplicatedEvents(testName, false, events, nil, false) - jUnitName := getJUnitName(testName, namespace) + jUnitName := getJUnitName(testName, test.ns) for _, junit := range junits { if junit.Name == jUnitName { if test.expectedMessage != "" { From 85008d1964ba14a5e0509c5749d2478042b9ef15 Mon Sep 17 00:00:00 2001 From: Pranshu Srivastava Date: Wed, 5 Nov 2025 00:19:56 +0530 Subject: [PATCH 6/6] fixup! fixup! fixup! fixup! fixup! 
OCPBUGS-62703: Relax duplicate events detection for Prometheus --- .../duplicated_event_patterns.go | 73 +++++++++---------- .../duplicated_events_test.go | 2 +- 2 files changed, 37 insertions(+), 38 deletions(-) diff --git a/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go b/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go index 44cbc2986231..7e86cac87ee5 100644 --- a/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go +++ b/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go @@ -502,43 +502,6 @@ func NewUniversalPathologicalEventMatchers(kubeConfig *rest.Config, finalInterva twoNodeEtcdEndpointsMatcher := newTwoNodeEtcdEndpointsConfigMissingEventMatcher(finalIntervals) registry.AddPathologicalEventMatcherOrDie(twoNodeEtcdEndpointsMatcher) - registry.AddPathologicalEventMatcherOrDie(&SimplePathologicalEventMatcher{ - name: "PrometheusReadinessProbeErrors", - locatorKeyRegexes: map[monitorapi.LocatorKey]*regexp.Regexp{ - monitorapi.LocatorNamespaceKey: regexp.MustCompile(`^openshift-monitoring$`), - monitorapi.LocatorPodKey: regexp.MustCompile(`^prometheus-k8s-[0,1]$`), - }, - messageReasonRegex: regexp.MustCompile(`^` + string(monitorapi.UnhealthyReason) + `$`), - messageHumanRegex: regexp.MustCompile("Readiness probe errored"), - jira: "https://issues.redhat.com/browse/OCPBUGS-62703", - /* - 05:50:32 openshift-monitoring kubelet prometheus-k8s-1 - Unhealthy - Readiness probe errored: rpc error: code = NotFound desc = container is not created or running: checking if PID of 58577e7deb7b8ae87b8029b9988fa268613748d0743ce989748f27e52b199ef5 is running failed: container process not found - - 05:53:52 (x25) openshift-monitoring kubelet prometheus-k8s-0 - Unhealthy - Readiness probe errored: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 - - 11:44:16 (x56) openshift-monitoring kubelet prometheus-k8s-0 - Unhealthy - Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 - - Readiness probes run during the lifecycle of the container, including termination. - Prometheus pods may take some time to stop, and thus result in more kubelet pings than permitted by default (20). - With a termination grace period of 600s, these pods may lead to probe errors (e.g. the web service is stopped but the process is still running), which is expected during upgrades. - - To address this, set the threshold to 100 (approximately 600 (termination period) / 5 (probe interval)), to allow for a high number of readiness probe errors during the upgrade, but not so high that we would miss a real problem. - The job below hit ~60 readiness errors during the upgrade: - https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.20-upgrade-from-stable-4.19-e2e-aws-ovn-upgrade/1977094149035266048, which makes sense to ignore, - However, the job below hit readiness errors 774 times during the upgrade: - https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.19-upgrade-from-stable-4.18-e2e-metal-ovn-single-node-rt-upgrade-test/1975691393640697856, which should be caught. - - Also, do note that these events were exhibited outside of upgrades as well, so we need to allow them in general. 
- */ - repeatThresholdOverride: 100, - }) - return registry } @@ -601,6 +564,42 @@ func NewUpgradePathologicalEventMatchers(kubeConfig *rest.Config, finalIntervals m := newFailedSchedulingDuringNodeUpdatePathologicalEventMatcher(finalIntervals) registry.AddPathologicalEventMatcherOrDie(m) + // Prometheus pods may have readiness probe errors during upgrades. + registry.AddPathologicalEventMatcherOrDie(&SimplePathologicalEventMatcher{ + name: "PrometheusReadinessProbeErrors", + locatorKeyRegexes: map[monitorapi.LocatorKey]*regexp.Regexp{ + monitorapi.LocatorNamespaceKey: regexp.MustCompile(`^openshift-monitoring$`), + monitorapi.LocatorPodKey: regexp.MustCompile(`^prometheus-k8s-[0,1]$`), + }, + messageReasonRegex: regexp.MustCompile(`^` + string(monitorapi.UnhealthyReason) + `$`), + messageHumanRegex: regexp.MustCompile("Readiness probe errored"), + jira: "https://issues.redhat.com/browse/OCPBUGS-62703", + /* + 05:50:32 openshift-monitoring kubelet prometheus-k8s-1 + Unhealthy + Readiness probe errored: rpc error: code = NotFound desc = container is not created or running: checking if PID of 58577e7deb7b8ae87b8029b9988fa268613748d0743ce989748f27e52b199ef5 is running failed: container process not found + + 05:53:52 (x25) openshift-monitoring kubelet prometheus-k8s-0 + Unhealthy + Readiness probe errored: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 + + 11:44:16 (x56) openshift-monitoring kubelet prometheus-k8s-0 + Unhealthy + Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 + + Readiness probes run during the lifecycle of the container, including termination. + Prometheus pods may take some time to stop, and thus result in more kubelet pings than permitted by default (20). + With a termination grace period of 600s, these pods may lead to probe errors (e.g. the web service is stopped but the process is still running), which is expected during upgrades. + + To address this, set the threshold to 100 (approximately 600 (termination period) / 5 (probe interval)), to allow for a high number of readiness probe errors during the upgrade, but not so high that we would miss a real problem. + The job below hit ~60 readiness errors during the upgrade: + https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.20-upgrade-from-stable-4.19-e2e-aws-ovn-upgrade/1977094149035266048, which makes sense to ignore, + However, the job below hit readiness errors 774 times during the upgrade: + https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.19-upgrade-from-stable-4.18-e2e-metal-ovn-single-node-rt-upgrade-test/1975691393640697856, which should be caught. 
+ */ + repeatThresholdOverride: 100, + }) + return registry } diff --git a/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events_test.go b/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events_test.go index 45d9a702d201..dbc9298eb3c8 100644 --- a/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events_test.go +++ b/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events_test.go @@ -761,7 +761,7 @@ func TestPathologicalEventsPrometheusReadinessProbeErrors(t *testing.T) { }, }) evaluator := duplicateEventsEvaluator{ - registry: NewUniversalPathologicalEventMatchers(nil, events), + registry: NewUpgradePathologicalEventMatchers(nil, events), } testName := "events should not repeat"
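
For a quick mental model of what this series ends up gating, the matcher registered in the final patch boils down to a count comparison guarded by a few regexes. The sketch below is a simplified, self-contained illustration of that decision under the thresholds discussed in the patch comments; the names `event` and `allowPrometheusReadinessEvent` are invented for the example and do not appear in the origin codebase, which expresses the same logic through a SimplePathologicalEventMatcher with repeatThresholdOverride.

package main

import (
	"fmt"
	"regexp"
)

// event is an illustrative stand-in for a pathological interval: a kubelet
// event with its locator fields and the repeat count parsed from "(xN)".
type event struct {
	Namespace string
	Pod       string
	Reason    string
	Message   string
	Count     int
}

var (
	promNamespace = regexp.MustCompile(`^openshift-monitoring$`)
	promPod       = regexp.MustCompile(`^prometheus-k8s-[01]$`)
	probeErrored  = regexp.MustCompile(`Readiness probe errored`)
)

// The prometheus-k8s pods have a 600s termination grace period and a ~5s
// probe period, so on the order of 100-120 failed probes are plausible during
// a clean shutdown; 100 tolerates that while still flagging runaway cases
// such as the 774-repeat single-node job cited in the patch comments.
const prometheusReadinessThreshold = 100

// allowPrometheusReadinessEvent reports whether a repeating readiness-probe
// error event should be excused by the relaxed threshold.
func allowPrometheusReadinessEvent(e event) bool {
	if !promNamespace.MatchString(e.Namespace) ||
		!promPod.MatchString(e.Pod) ||
		e.Reason != "Unhealthy" ||
		!probeErrored.MatchString(e.Message) {
		return false // not covered by this exception; the default repeat limit still applies
	}
	return e.Count <= prometheusReadinessThreshold
}

func main() {
	tolerated := event{"openshift-monitoring", "prometheus-k8s-0", "Unhealthy",
		"Readiness probe errored: rpc error: code = Unknown desc = container is stopping", 60}
	rejected := event{"openshift-monitoring", "prometheus-k8s-1", "Unhealthy",
		"Readiness probe errored: rpc error: code = Unknown desc = container is stopping", 774}
	fmt.Println(allowPrometheusReadinessEvent(tolerated)) // true: within the relaxed limit
	fmt.Println(allowPrometheusReadinessEvent(rejected))  // false: still fails the repeat check
}

As the unit tests above suggest, in the real code the repeat count arrives via the pathological/count annotations on the interval message, and the comparison against repeatThresholdOverride is performed by the shared duplicated-events evaluator rather than by the matcher itself.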