OCPBUGS-62703: Relax duplicate events detection for Prometheus #30372
```diff
@@ -502,6 +502,9 @@ func NewUniversalPathologicalEventMatchers(kubeConfig *rest.Config, finalInterva
 	twoNodeEtcdEndpointsMatcher := newTwoNodeEtcdEndpointsConfigMissingEventMatcher(finalIntervals)
 	registry.AddPathologicalEventMatcherOrDie(twoNodeEtcdEndpointsMatcher)
 
+	prometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher := newPrometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher(finalIntervals)
+	registry.AddPathologicalEventMatcherOrDie(prometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher)
+
 	return registry
 }
```

```diff
@@ -1171,3 +1174,66 @@ func newCrioReloadedTooOftenEventMatcher(finalInternals monitorapi.Intervals) Ev
 		allowIfWithinIntervals: crioReloadedIntervals,
 	}
 }
+
+func newPrometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher(finalIntervals monitorapi.Intervals) EventMatcher {
+	podNamePrefix := "prometheus-k8s"
+	podNamespace := "openshift-monitoring"
+	messageHumanizedSubstring := "Readiness probe errored"
+	messageReason := monitorapi.UnhealthyReason
+	matcher := &SimplePathologicalEventMatcher{
+		name: "PrometheusReadinessProbeErrorsDuringUpgrades",
+		locatorKeyRegexes: map[monitorapi.LocatorKey]*regexp.Regexp{
+			monitorapi.LocatorNamespaceKey: regexp.MustCompile(`^` + podNamespace + `$`),
+			monitorapi.LocatorPodKey:       regexp.MustCompile(`^` + podNamePrefix + `-[0,1]$`),
+		},
+		messageReasonRegex: regexp.MustCompile(`^` + string(messageReason) + `$`),
+		messageHumanRegex:  regexp.MustCompile(messageHumanizedSubstring),
+		jira:               "https://issues.redhat.com/browse/OCPBUGS-62703",
+	}
+
+	// Sanity check in case no `finalIntervals` are provided.
+	if finalIntervals == nil || len(finalIntervals) == 0 {
+		matcher.neverAllow = true
+		return matcher
+	}
+
+	/*
+		Example events this matcher targets:
+
+		05:50:32 openshift-monitoring kubelet prometheus-k8s-1
+		Unhealthy
+		Readiness probe errored: rpc error: code = NotFound desc = container is not created or running: checking if PID of 58577e7deb7b8ae87b8029b9988fa268613748d0743ce989748f27e52b199ef5 is running failed: container process not found
+
+		05:53:52 (x25) openshift-monitoring kubelet prometheus-k8s-0
+		Unhealthy
+		Readiness probe errored: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1
+
+		11:44:16 (x56) openshift-monitoring kubelet prometheus-k8s-0
+		Unhealthy
+		Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1
+	*/
+	testIntervals := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool {
+		return eventInterval.Locator.Type == monitorapi.LocatorTypePod &&
+			eventInterval.Locator.Keys[monitorapi.LocatorNamespaceKey] == podNamespace &&
+			strings.HasPrefix(eventInterval.Locator.Keys[monitorapi.LocatorPodKey], podNamePrefix) &&
+			eventInterval.Message.Reason == messageReason &&
+			strings.Contains(eventInterval.Message.HumanMessage, messageHumanizedSubstring)
+	})
+
+	if len(testIntervals) > 0 {
+		/*
+			Readiness probes run during the lifecycle of the container, including termination.
+			Prometheus pods may take some time to stop, and thus result in more kubelet pings than permitted by default (20).
+			With a termination grace period of 600s, these pods may lead to probe errors (e.g. the web service is stopped but the process is still running), which is expected during upgrades.
+
+			To address this, set the threshold to 100 (approximately 600s termination period / 5s probe interval), allowing a high number of readiness probe errors during the upgrade but not so high that we would miss a real problem.
+			The job below hit ~60 readiness errors during the upgrade, which makes sense to ignore:
+			https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.20-upgrade-from-stable-4.19-e2e-aws-ovn-upgrade/1977094149035266048
+			However, the job below hit readiness errors 774 times during the upgrade, which should be caught:
+			https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.19-upgrade-from-stable-4.18-e2e-metal-ovn-single-node-rt-upgrade-test/1975691393640697856
+		*/
+		matcher.repeatThresholdOverride = 100
+	} else {
+		matcher.neverAllow = true
+	}
+
+	return matcher
+}
```
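For reviewers who want to poke at the matcher's regexes in isolation, here is a minimal, self-contained sketch (not part of the diff) that runs the same namespace/pod/message patterns against inputs modeled on the sample events quoted above; the sample table and the standalone `main` are illustrative assumptions only.

```go
package main

import (
	"fmt"
	"regexp"
)

func main() {
	// Same patterns as in the matcher above (values copied from the diff).
	namespaceRe := regexp.MustCompile(`^openshift-monitoring$`)
	podRe := regexp.MustCompile(`^prometheus-k8s-[0,1]$`)
	humanRe := regexp.MustCompile(`Readiness probe errored`)

	// Inputs loosely modeled on the kubelet events quoted in the code comment,
	// plus two counterexamples; these are illustrative, not real intervals.
	samples := []struct {
		namespace, pod, message string
	}{
		{"openshift-monitoring", "prometheus-k8s-1", "Readiness probe errored: rpc error: code = NotFound desc = container is not created or running"},
		{"openshift-monitoring", "prometheus-k8s-0", "Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error"},
		{"openshift-monitoring", "prometheus-k8s-0", "Liveness probe failed: HTTP probe failed"},
		{"openshift-ingress", "router-default-1", "Readiness probe errored: connection refused"},
	}

	for _, s := range samples {
		matched := namespaceRe.MatchString(s.namespace) &&
			podRe.MatchString(s.pod) &&
			humanRe.MatchString(s.message)
		fmt.Printf("%-22s %-18s matched=%v\n", s.namespace, s.pod, matched)
	}
}
```

The last two rows fall through because of the message substring and namespace checks; note the real matcher additionally requires the `Unhealthy` reason, which this sketch omits.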
The function name indicates this is intended to relax detection only during upgrades, but it's added to the universal set here, not the upgrade-specific set below. This should probably be moved down into that function.
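If the registration were moved as suggested, the change might look roughly like the sketch below; the name, signature, and return type of the upgrade-specific constructor (`NewUpgradePathologicalEventMatchers`) are assumed from the reviewer's reference to the upgrade-specific set and may not match the actual file.

```diff
 func NewUniversalPathologicalEventMatchers(kubeConfig *rest.Config, finalIntervals monitorapi.Intervals) *AllowedPathologicalEventRegistry {
 	// ...
-	prometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher := newPrometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher(finalIntervals)
-	registry.AddPathologicalEventMatcherOrDie(prometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher)
-
 	return registry
 }

 // Upgrade-specific registry; builds on the universal set (name assumed).
 func NewUpgradePathologicalEventMatchers(kubeConfig *rest.Config, finalIntervals monitorapi.Intervals) *AllowedPathologicalEventRegistry {
 	registry := NewUniversalPathologicalEventMatchers(kubeConfig, finalIntervals)
 	// ...
+	prometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher := newPrometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher(finalIntervals)
+	registry.AddPathologicalEventMatcherOrDie(prometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher)
 	return registry
 }
```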
I saw this fail for (what looks to be) non-upgrade jobs, such as periodic-ci-openshift-release-master-nightly-4.21-e2e-agent-ha-dualstack-conformance. I should probably rename this to drop the `DuringUpgrades` part so it isn't misleading. Please let me know if you still think this should be moved to upgrade jobs exclusively.
Is there an explanation for why Prometheus would be getting killed in a non-upgrade job? As far as I understand it, that would be quite unexpected and should still get flagged. Let me know if you have that job run handy; I'm curious.
From [openshift-origin-30372-nightly-4.21-e2e-agent-ha-dualstack-conformance], it seems to be a liveness probe failure: the Prometheus container in the affected pod looks fine, but other containers in the pod seem to be facing connection issues (possibly due to https://issues.redhat.com/browse/OCPBUGS-32021)?
I doubt that https://issues.redhat.com/browse/OCPBUGS-32021 is involved here: the error logs come from the router, which opens a TCP connection and then drops it after a successful connect.
The link to the job above didn't seem to work, but it looks like it's probably a different symptom and would not hit your matcher as defined. I think it's best to make this an upgrade-specific exception, as upgrades are the only place we're expecting this 10-minute delay.
I see; unlike https://issues.redhat.com/browse/OCPBUGS-5916, we are indeed able to establish a successful connection after retries (though IIUC the connection heals in the linked ticket as well, as there's no functionality disruption reported?).
Ah, my bad. I've appended more context to the snippet (PTAL below), and here's the event-filter link. As you can see, the readiness error will register a `PathologicalNew` error if not explicitly ignored (>20 pings), and AFAICT this will be caught by the matcher.
(this particular human message hits four cases covered in the tests, PTAL at https://github.com/openshift/origin/pull/30372/files#diff-d6cc316666a1dc1843f699fc90418fe34425e15b994f2c00d2390ba426cf33b3R719-R750)
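To make the numbers in this thread concrete, here is a small standalone sketch (assuming, as the code comment describes, that an event is flagged as pathological once its repeat count exceeds the threshold) comparing the default threshold of 20 with the proposed override of 100 against the repeat counts mentioned in this PR:

```go
package main

import "fmt"

func main() {
	const defaultThreshold = 20   // default repeat limit mentioned in the code comment
	const overrideThreshold = 100 // repeatThresholdOverride set by this PR

	// Repeat counts seen in this PR: x25 and x56 from the sample events,
	// ~60 from the AWS upgrade job, 774 from the single-node RT upgrade job.
	for _, repeats := range []int{25, 56, 60, 774} {
		fmt.Printf("repeats=%3d  flagged@%d=%-5v  flagged@%d=%v\n",
			repeats,
			defaultThreshold, repeats > defaultThreshold,
			overrideThreshold, repeats > overrideThreshold)
	}
}
```

Under the default all four counts would be flagged; with the override only the 774-repeat single-node RT job is, which matches the intent described in the code comment.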
Let's keep this upgrade-specific unless there's a very clear explanation for why this is expected and OK in a non-upgrade job, which AFAICT there is not. Looking at the intervals from your job run, you can see there is mass disruption at the time we lose the readiness probe. Problems occurring during that time are not the kind of thing we want to hide.
This dualstack job has a known bug related to that network disruption.