From b5321f7d8b9c999fad89f60b745402e4a2067b63 Mon Sep 17 00:00:00 2001 From: Pranshu Srivastava Date: Fri, 10 Oct 2025 03:21:29 +0530 Subject: [PATCH 1/6] OCPBUGS-62703: Relax duplicate events detection for Prometheus Overrides the duplicate readiness error events' limit for Prometheus during upgrades. Since Prometheus needs some time to wind down (see [1]), it was causing Kubelet to exhibit readiness error events during the time span it took to terminate. This ignores those pings to a limit (100). [1]: https://github.com/prometheus-operator/prometheus-operator/blob/d0ae00fdedc656a5a1a290d9839b84d860f15428/pkg/prometheus/common.go#L56-L59 Signed-off-by: Pranshu Srivastava --- .../duplicated_event_patterns.go | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go b/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go index 52e54b0ca82b..a95a79aa27a3 100644 --- a/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go +++ b/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go @@ -502,6 +502,9 @@ func NewUniversalPathologicalEventMatchers(kubeConfig *rest.Config, finalInterva twoNodeEtcdEndpointsMatcher := newTwoNodeEtcdEndpointsConfigMissingEventMatcher(finalIntervals) registry.AddPathologicalEventMatcherOrDie(twoNodeEtcdEndpointsMatcher) + prometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher := newPrometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher(finalIntervals) + registry.AddPathologicalEventMatcherOrDie(prometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher) + return registry } @@ -1171,3 +1174,58 @@ func newCrioReloadedTooOftenEventMatcher(finalInternals monitorapi.Intervals) Ev allowIfWithinIntervals: crioReloadedIntervals, } } + +func newPrometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher(finalIntervals monitorapi.Intervals) EventMatcher { + statefulSetName := "prometheus-k8s" + statefulSetNamespace := "openshift-monitoring" + messageHumanizedSubstring := "Readiness probe errored: rpc error" + messageReason := "Unhealthy" + matcher := &SimplePathologicalEventMatcher{ + name: "PrometheusReadinessProbeErrorsDuringUpgrades", + locatorKeyRegexes: map[monitorapi.LocatorKey]*regexp.Regexp{ + monitorapi.LocatorNamespaceKey: regexp.MustCompile(`^` + statefulSetNamespace + `$`), + monitorapi.LocatorStatefulSetKey: regexp.MustCompile(`^` + statefulSetName + `$`), + }, + messageReasonRegex: regexp.MustCompile(`^` + messageReason + `$`), + messageHumanRegex: regexp.MustCompile(messageHumanizedSubstring), + jira: "https://issues.redhat.com/browse/OCPBUGS-62703", + } + + // Sanity check in case no `finalIntervals` are provided. 
+ if finalIntervals == nil || len(finalIntervals) == 0 { + matcher.neverAllow = true + return matcher + } + + /* + 05:50:32 openshift-monitoring kubelet prometheus-k8s-1 + Unhealthy + Readiness probe errored: rpc error: code = NotFound desc = container is not created or running: checking if PID of 58577e7deb7b8ae87b8029b9988fa268613748d0743ce989748f27e52b199ef5 is running failed: container process not found + + 05:53:52 (x25) openshift-monitoring kubelet prometheus-k8s-0 + Unhealthy + Readiness probe errored: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 + */ + testIntervals := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool { + return eventInterval.Locator.Type == monitorapi.LocatorTypeStatefulSet && + eventInterval.Locator.Keys[monitorapi.LocatorNamespaceKey] == statefulSetNamespace && + eventInterval.Locator.Keys[monitorapi.LocatorStatefulSetKey] == statefulSetName && + eventInterval.Message.Reason == monitorapi.IntervalReason(messageReason) && + strings.Contains(eventInterval.Message.HumanMessage, messageHumanizedSubstring) + }) + + if len(testIntervals) > 0 { + // Readiness probe errors are expected during upgrades, allow a higher threshold. + // Set the threshold to 100 to allow for a high number of readiness probe errors + // during the upgrade, but not so high that we would miss a real problem, i.e., + // the job below (and usually) hit ~60 readiness errors during the upgrade, + // https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.20-upgrade-from-stable-4.19-e2e-aws-ovn-upgrade/1977094149035266048, + // However, the job below hit readiness errors 774 times during the upgrade, + // https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.19-upgrade-from-stable-4.18-e2e-metal-ovn-single-node-rt-upgrade-test/1975691393640697856. + matcher.repeatThresholdOverride = 100 + } else { + matcher.neverAllow = true + } + + return matcher +} From 0d1ade8b885782a6c6c78e8284ad9b024be165d7 Mon Sep 17 00:00:00 2001 From: Pranshu Srivastava Date: Wed, 29 Oct 2025 00:34:39 +0530 Subject: [PATCH 2/6] fixup! 
OCPBUGS-62703: Relax duplicate events detection for Prometheus --- pkg/monitor/monitorapi/types.go | 2 ++ .../duplicated_event_patterns.go | 26 +++++++++++-------- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/pkg/monitor/monitorapi/types.go b/pkg/monitor/monitorapi/types.go index bf68bcd45a5a..185a36e76944 100644 --- a/pkg/monitor/monitorapi/types.go +++ b/pkg/monitor/monitorapi/types.go @@ -251,6 +251,8 @@ const ( FailedToAuthenticateWithOpenShiftUser IntervalReason = "FailedToAuthenticateWithOpenShiftUser" FailedContactingAPIReason IntervalReason = "FailedContactingAPI" + UnhealthyReason IntervalReason = "Unhealthy" + UpgradeStartedReason IntervalReason = "UpgradeStarted" UpgradeVersionReason IntervalReason = "UpgradeVersion" UpgradeRollbackReason IntervalReason = "UpgradeRollback" diff --git a/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go b/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go index a95a79aa27a3..c6a6ef58417f 100644 --- a/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go +++ b/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go @@ -1176,17 +1176,17 @@ func newCrioReloadedTooOftenEventMatcher(finalInternals monitorapi.Intervals) Ev } func newPrometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher(finalIntervals monitorapi.Intervals) EventMatcher { - statefulSetName := "prometheus-k8s" - statefulSetNamespace := "openshift-monitoring" - messageHumanizedSubstring := "Readiness probe errored: rpc error" - messageReason := "Unhealthy" + podNamePrefix := "prometheus-k8s" + podNamespace := "openshift-monitoring" + messageHumanizedSubstring := "Readiness probe errored" + messageReason := monitorapi.UnhealthyReason matcher := &SimplePathologicalEventMatcher{ name: "PrometheusReadinessProbeErrorsDuringUpgrades", locatorKeyRegexes: map[monitorapi.LocatorKey]*regexp.Regexp{ - monitorapi.LocatorNamespaceKey: regexp.MustCompile(`^` + statefulSetNamespace + `$`), - monitorapi.LocatorStatefulSetKey: regexp.MustCompile(`^` + statefulSetName + `$`), + monitorapi.LocatorNamespaceKey: regexp.MustCompile(`^` + podNamespace + `$`), + monitorapi.LocatorPodKey: regexp.MustCompile(`^` + podNamePrefix + `-[0,1]$`), }, - messageReasonRegex: regexp.MustCompile(`^` + messageReason + `$`), + messageReasonRegex: regexp.MustCompile(`^` + string(messageReason) + `$`), messageHumanRegex: regexp.MustCompile(messageHumanizedSubstring), jira: "https://issues.redhat.com/browse/OCPBUGS-62703", } @@ -1205,12 +1205,16 @@ func newPrometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher(fin 05:53:52 (x25) openshift-monitoring kubelet prometheus-k8s-0 Unhealthy Readiness probe errored: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 + + 11:44:16 (x56) openshift-monitoring kubelet prometheus-k8s-0 + Unhealthy + Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 */ testIntervals := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool { - return eventInterval.Locator.Type == monitorapi.LocatorTypeStatefulSet && - eventInterval.Locator.Keys[monitorapi.LocatorNamespaceKey] == statefulSetNamespace && - eventInterval.Locator.Keys[monitorapi.LocatorStatefulSetKey] == statefulSetName && - eventInterval.Message.Reason == 
monitorapi.IntervalReason(messageReason) && + return eventInterval.Locator.Type == monitorapi.LocatorTypePod && + eventInterval.Locator.Keys[monitorapi.LocatorNamespaceKey] == podNamespace && + strings.HasPrefix(eventInterval.Locator.Keys[monitorapi.LocatorPodKey], podNamePrefix) && + eventInterval.Message.Reason == messageReason && strings.Contains(eventInterval.Message.HumanMessage, messageHumanizedSubstring) }) From 785a37c77c174581a578ec880643a7dc7513447d Mon Sep 17 00:00:00 2001 From: Pranshu Srivastava Date: Thu, 30 Oct 2025 12:51:49 +0530 Subject: [PATCH 3/6] fixup! fixup! OCPBUGS-62703: Relax duplicate events detection for Prometheus --- .../duplicated_event_patterns.go | 18 +-- .../duplicated_events_test.go | 115 ++++++++++++++++++ 2 files changed, 126 insertions(+), 7 deletions(-) diff --git a/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go b/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go index c6a6ef58417f..3eb016b2ed16 100644 --- a/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go +++ b/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go @@ -1219,13 +1219,17 @@ func newPrometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher(fin }) if len(testIntervals) > 0 { - // Readiness probe errors are expected during upgrades, allow a higher threshold. - // Set the threshold to 100 to allow for a high number of readiness probe errors - // during the upgrade, but not so high that we would miss a real problem, i.e., - // the job below (and usually) hit ~60 readiness errors during the upgrade, - // https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.20-upgrade-from-stable-4.19-e2e-aws-ovn-upgrade/1977094149035266048, - // However, the job below hit readiness errors 774 times during the upgrade, - // https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.19-upgrade-from-stable-4.18-e2e-metal-ovn-single-node-rt-upgrade-test/1975691393640697856. + /* + Readiness probes run during the lifecycle of the container, including termination. + Prometheus pods may take some time to stop, and thus result in more kubelet pings than permitted by default (20). + With a termination grace period of 600s, these pods may lead to probe errors (e.g. the web service is stopped but the process is still running), which is expected during upgrades. + + To address this, set the threshold to 100 (approximately 600 (termination period) / 5 (probe interval)), to allow for a high number of readiness probe errors during the upgrade, but not so high that we would miss a real problem. + The job below hit ~60 readiness errors during the upgrade: + https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.20-upgrade-from-stable-4.19-e2e-aws-ovn-upgrade/1977094149035266048, which makes sense to ignore, + However, the job below hit readiness errors 774 times during the upgrade: + https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.19-upgrade-from-stable-4.18-e2e-metal-ovn-single-node-rt-upgrade-test/1975691393640697856, which should be caught. 
+ */ matcher.repeatThresholdOverride = 100 } else { matcher.neverAllow = true diff --git a/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events_test.go b/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events_test.go index 25bd2d5ce41e..30f9b18f2900 100644 --- a/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events_test.go +++ b/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events_test.go @@ -2,6 +2,7 @@ package pathologicaleventlibrary import ( _ "embed" + "fmt" "testing" "time" @@ -666,3 +667,117 @@ func TestPathologicalEventsTopologyAwareHintsDisabled(t *testing.T) { }) } } + +func TestPathologicalEventsPrometheusReadinessProbeErrorsDuringUpgrades(t *testing.T) { + const namespace = "openshift-monitoring" + + unhealthyReasonPathologicalMessageWithHumanMessage := func(humanMessage string, repetitionCount int) monitorapi.Message { + return monitorapi.Message{ + Reason: monitorapi.UnhealthyReason, + HumanMessage: humanMessage, + Annotations: map[monitorapi.AnnotationKey]string{ + monitorapi.AnnotationCount: fmt.Sprintf("%d", repetitionCount), + monitorapi.AnnotationPathological: "true", + }, + } + } + + openshiftMonitoringNsLocatorWithPodKey := func(pod string) monitorapi.Locator { + return monitorapi.Locator{ + Type: monitorapi.LocatorTypePod, + Keys: map[monitorapi.LocatorKey]string{ + monitorapi.LocatorNamespaceKey: "openshift-monitoring", + monitorapi.LocatorPodKey: pod, + }, + } + } + + tests := []struct { + name string + intervals []monitorapi.Interval + expectedMessage string + }{ + { + name: "Readiness probe error (stopping container) on first Prometheus pod", + intervals: []monitorapi.Interval{ + { + Condition: monitorapi.Condition{ + Locator: openshiftMonitoringNsLocatorWithPodKey("prometheus-k8s-0"), + Message: unhealthyReasonPathologicalMessageWithHumanMessage("Readiness probe errored: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1", 100), + }, + }, + }, + }, + { + name: "Readiness probe error (terminated container) on second Prometheus pod", + intervals: []monitorapi.Interval{ + { + Condition: monitorapi.Condition{ + Locator: openshiftMonitoringNsLocatorWithPodKey("prometheus-k8s-1"), + Message: unhealthyReasonPathologicalMessageWithHumanMessage("Readiness probe errored: rpc error: code = NotFound desc = container is not created or running: checking if PID of 58577e7deb7b8ae87b8029b9988fa268613748d0743ce989748f27e52b199ef5 is running failed: container process not found", 100), + }, + }, + }, + }, + { + name: "Readiness probe error (stopping container, different human message) on second Prometheus pod", + intervals: []monitorapi.Interval{ + { + Condition: monitorapi.Condition{ + Locator: openshiftMonitoringNsLocatorWithPodKey("prometheus-k8s-1"), + Message: unhealthyReasonPathologicalMessageWithHumanMessage("Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1", 100), + }, + }, + }, + }, + { + name: "Readiness probe error (stopping container, different human message) on non-existent Prometheus pod should not be ignored", + intervals: []monitorapi.Interval{ + { + Condition: monitorapi.Condition{ + Locator: openshiftMonitoringNsLocatorWithPodKey("prometheus-k8s-2"), + Message: unhealthyReasonPathologicalMessageWithHumanMessage("Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = 
command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1", 100), + }, + }, + }, + expectedMessage: "1 events happened too frequently\n\nevent happened 100 times, something is wrong: namespace/openshift-monitoring pod/prometheus-k8s-2 - reason/Unhealthy Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 (00:00:00Z) result=reject ", + }, + { + name: "Readiness probe error (stopping container, different human message) on a Prometheus pod should not be ignored above the acceptable limit", + intervals: []monitorapi.Interval{ + { + Condition: monitorapi.Condition{ + Locator: openshiftMonitoringNsLocatorWithPodKey("prometheus-k8s-1"), + Message: unhealthyReasonPathologicalMessageWithHumanMessage("Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1", 101), + }, + }, + }, + expectedMessage: "1 events happened too frequently\n\nevent happened 101 times, something is wrong: namespace/openshift-monitoring pod/prometheus-k8s-1 - reason/Unhealthy Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 (00:00:00Z) result=reject ", + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + events := monitorapi.Intervals(test.intervals) + evaluator := duplicateEventsEvaluator{ + registry: NewUpgradePathologicalEventMatchers(nil, events), + } + + testName := "events should not repeat" + junits := evaluator.testDuplicatedEvents(testName, false, events, nil, false) + jUnitName := getJUnitName(testName, namespace) + for _, junit := range junits { + if junit.Name == jUnitName { + if test.expectedMessage != "" { + require.NotNil(t, junit.FailureOutput, "expected junit to have failure output") + require.Equal(t, test.expectedMessage, junit.FailureOutput.Output) + } else { + require.Nil(t, junit.FailureOutput, "expected success but got failure output for junit: %s", junit.Name) + } + + break + } + } + }) + } +} From 8753df3c814950fb47e74392f9aa82292b503c17 Mon Sep 17 00:00:00 2001 From: Pranshu Srivastava Date: Tue, 4 Nov 2025 16:07:40 +0530 Subject: [PATCH 4/6] fixup! fixup! fixup! 
OCPBUGS-62703: Relax duplicate events detection for Prometheus --- .../duplicated_event_patterns.go | 101 +++++++----------- 1 file changed, 36 insertions(+), 65 deletions(-) diff --git a/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go b/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go index 3eb016b2ed16..44cbc2986231 100644 --- a/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go +++ b/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go @@ -502,8 +502,42 @@ func NewUniversalPathologicalEventMatchers(kubeConfig *rest.Config, finalInterva twoNodeEtcdEndpointsMatcher := newTwoNodeEtcdEndpointsConfigMissingEventMatcher(finalIntervals) registry.AddPathologicalEventMatcherOrDie(twoNodeEtcdEndpointsMatcher) - prometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher := newPrometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher(finalIntervals) - registry.AddPathologicalEventMatcherOrDie(prometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher) + registry.AddPathologicalEventMatcherOrDie(&SimplePathologicalEventMatcher{ + name: "PrometheusReadinessProbeErrors", + locatorKeyRegexes: map[monitorapi.LocatorKey]*regexp.Regexp{ + monitorapi.LocatorNamespaceKey: regexp.MustCompile(`^openshift-monitoring$`), + monitorapi.LocatorPodKey: regexp.MustCompile(`^prometheus-k8s-[0,1]$`), + }, + messageReasonRegex: regexp.MustCompile(`^` + string(monitorapi.UnhealthyReason) + `$`), + messageHumanRegex: regexp.MustCompile("Readiness probe errored"), + jira: "https://issues.redhat.com/browse/OCPBUGS-62703", + /* + 05:50:32 openshift-monitoring kubelet prometheus-k8s-1 + Unhealthy + Readiness probe errored: rpc error: code = NotFound desc = container is not created or running: checking if PID of 58577e7deb7b8ae87b8029b9988fa268613748d0743ce989748f27e52b199ef5 is running failed: container process not found + + 05:53:52 (x25) openshift-monitoring kubelet prometheus-k8s-0 + Unhealthy + Readiness probe errored: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 + + 11:44:16 (x56) openshift-monitoring kubelet prometheus-k8s-0 + Unhealthy + Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 + + Readiness probes run during the lifecycle of the container, including termination. + Prometheus pods may take some time to stop, and thus result in more kubelet pings than permitted by default (20). + With a termination grace period of 600s, these pods may lead to probe errors (e.g. the web service is stopped but the process is still running), which is expected during upgrades. + + To address this, set the threshold to 100 (approximately 600 (termination period) / 5 (probe interval)), to allow for a high number of readiness probe errors during the upgrade, but not so high that we would miss a real problem. 
+ The job below hit ~60 readiness errors during the upgrade: + https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.20-upgrade-from-stable-4.19-e2e-aws-ovn-upgrade/1977094149035266048, which makes sense to ignore, + However, the job below hit readiness errors 774 times during the upgrade: + https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.19-upgrade-from-stable-4.18-e2e-metal-ovn-single-node-rt-upgrade-test/1975691393640697856, which should be caught. + + Also, do note that these events were exhibited outside of upgrades as well, so we need to allow them in general. + */ + repeatThresholdOverride: 100, + }) return registry } @@ -1174,66 +1208,3 @@ func newCrioReloadedTooOftenEventMatcher(finalInternals monitorapi.Intervals) Ev allowIfWithinIntervals: crioReloadedIntervals, } } - -func newPrometheusReadinessProbeErrorsDuringUpgradesPathologicalEventMatcher(finalIntervals monitorapi.Intervals) EventMatcher { - podNamePrefix := "prometheus-k8s" - podNamespace := "openshift-monitoring" - messageHumanizedSubstring := "Readiness probe errored" - messageReason := monitorapi.UnhealthyReason - matcher := &SimplePathologicalEventMatcher{ - name: "PrometheusReadinessProbeErrorsDuringUpgrades", - locatorKeyRegexes: map[monitorapi.LocatorKey]*regexp.Regexp{ - monitorapi.LocatorNamespaceKey: regexp.MustCompile(`^` + podNamespace + `$`), - monitorapi.LocatorPodKey: regexp.MustCompile(`^` + podNamePrefix + `-[0,1]$`), - }, - messageReasonRegex: regexp.MustCompile(`^` + string(messageReason) + `$`), - messageHumanRegex: regexp.MustCompile(messageHumanizedSubstring), - jira: "https://issues.redhat.com/browse/OCPBUGS-62703", - } - - // Sanity check in case no `finalIntervals` are provided. - if finalIntervals == nil || len(finalIntervals) == 0 { - matcher.neverAllow = true - return matcher - } - - /* - 05:50:32 openshift-monitoring kubelet prometheus-k8s-1 - Unhealthy - Readiness probe errored: rpc error: code = NotFound desc = container is not created or running: checking if PID of 58577e7deb7b8ae87b8029b9988fa268613748d0743ce989748f27e52b199ef5 is running failed: container process not found - - 05:53:52 (x25) openshift-monitoring kubelet prometheus-k8s-0 - Unhealthy - Readiness probe errored: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 - - 11:44:16 (x56) openshift-monitoring kubelet prometheus-k8s-0 - Unhealthy - Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 - */ - testIntervals := finalIntervals.Filter(func(eventInterval monitorapi.Interval) bool { - return eventInterval.Locator.Type == monitorapi.LocatorTypePod && - eventInterval.Locator.Keys[monitorapi.LocatorNamespaceKey] == podNamespace && - strings.HasPrefix(eventInterval.Locator.Keys[monitorapi.LocatorPodKey], podNamePrefix) && - eventInterval.Message.Reason == messageReason && - strings.Contains(eventInterval.Message.HumanMessage, messageHumanizedSubstring) - }) - - if len(testIntervals) > 0 { - /* - Readiness probes run during the lifecycle of the container, including termination. - Prometheus pods may take some time to stop, and thus result in more kubelet pings than permitted by default (20). - With a termination grace period of 600s, these pods may lead to probe errors (e.g. 
the web service is stopped but the process is still running), which is expected during upgrades. - - To address this, set the threshold to 100 (approximately 600 (termination period) / 5 (probe interval)), to allow for a high number of readiness probe errors during the upgrade, but not so high that we would miss a real problem. - The job below hit ~60 readiness errors during the upgrade: - https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.20-upgrade-from-stable-4.19-e2e-aws-ovn-upgrade/1977094149035266048, which makes sense to ignore, - However, the job below hit readiness errors 774 times during the upgrade: - https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.19-upgrade-from-stable-4.18-e2e-metal-ovn-single-node-rt-upgrade-test/1975691393640697856, which should be caught. - */ - matcher.repeatThresholdOverride = 100 - } else { - matcher.neverAllow = true - } - - return matcher -} From ed24ec41ba177e7f5de6b0a04827b98fa600646f Mon Sep 17 00:00:00 2001 From: Pranshu Srivastava Date: Tue, 4 Nov 2025 17:43:27 +0530 Subject: [PATCH 5/6] fixup! fixup! fixup! fixup! OCPBUGS-62703: Relax duplicate events detection for Prometheus --- .../duplicated_events_test.go | 105 +++++++++--------- 1 file changed, 53 insertions(+), 52 deletions(-) diff --git a/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events_test.go b/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events_test.go index 30f9b18f2900..45d9a702d201 100644 --- a/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events_test.go +++ b/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events_test.go @@ -668,7 +668,7 @@ func TestPathologicalEventsTopologyAwareHintsDisabled(t *testing.T) { } } -func TestPathologicalEventsPrometheusReadinessProbeErrorsDuringUpgrades(t *testing.T) { +func TestPathologicalEventsPrometheusReadinessProbeErrors(t *testing.T) { const namespace = "openshift-monitoring" unhealthyReasonPathologicalMessageWithHumanMessage := func(humanMessage string, repetitionCount int) monitorapi.Message { @@ -682,11 +682,11 @@ func TestPathologicalEventsPrometheusReadinessProbeErrorsDuringUpgrades(t *testi } } - openshiftMonitoringNsLocatorWithPodKey := func(pod string) monitorapi.Locator { + nsLocatorWithPodKey := func(pod, ns string) monitorapi.Locator { return monitorapi.Locator{ Type: monitorapi.LocatorTypePod, Keys: map[monitorapi.LocatorKey]string{ - monitorapi.LocatorNamespaceKey: "openshift-monitoring", + monitorapi.LocatorNamespaceKey: ns, monitorapi.LocatorPodKey: pod, }, } @@ -694,78 +694,79 @@ func TestPathologicalEventsPrometheusReadinessProbeErrorsDuringUpgrades(t *testi tests := []struct { name string - intervals []monitorapi.Interval expectedMessage string + pod string + ns string + humanMessage string + repetitionCount int }{ { - name: "Readiness probe error (stopping container) on first Prometheus pod", - intervals: []monitorapi.Interval{ - { - Condition: monitorapi.Condition{ - Locator: openshiftMonitoringNsLocatorWithPodKey("prometheus-k8s-0"), - Message: unhealthyReasonPathologicalMessageWithHumanMessage("Readiness probe errored: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1", 100), - }, - }, - }, + name: "Readiness probe error (stopping container) on first Prometheus pod", + expectedMessage: "", + pod: "prometheus-k8s-0", + ns: namespace, + humanMessage: "Readiness probe errored: 
rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1", + repetitionCount: 100, }, { - name: "Readiness probe error (terminated container) on second Prometheus pod", - intervals: []monitorapi.Interval{ - { - Condition: monitorapi.Condition{ - Locator: openshiftMonitoringNsLocatorWithPodKey("prometheus-k8s-1"), - Message: unhealthyReasonPathologicalMessageWithHumanMessage("Readiness probe errored: rpc error: code = NotFound desc = container is not created or running: checking if PID of 58577e7deb7b8ae87b8029b9988fa268613748d0743ce989748f27e52b199ef5 is running failed: container process not found", 100), - }, - }, - }, + name: "Readiness probe error (terminated container) on second Prometheus pod", + expectedMessage: "", + pod: "prometheus-k8s-1", + ns: namespace, + humanMessage: "Readiness probe errored: rpc error: code = NotFound desc = container is not created or running: checking if PID of 58577e7deb7b8ae87b8029b9988fa268613748d0743ce989748f27e52b199ef5 is running failed: container process not found", + repetitionCount: 100, }, { - name: "Readiness probe error (stopping container, different human message) on second Prometheus pod", - intervals: []monitorapi.Interval{ - { - Condition: monitorapi.Condition{ - Locator: openshiftMonitoringNsLocatorWithPodKey("prometheus-k8s-1"), - Message: unhealthyReasonPathologicalMessageWithHumanMessage("Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1", 100), - }, - }, - }, + name: "Readiness probe error (stopping container, different human message) on second Prometheus pod", + expectedMessage: "", + pod: "prometheus-k8s-1", + ns: namespace, + humanMessage: "Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1", + repetitionCount: 100, }, { - name: "Readiness probe error (stopping container, different human message) on non-existent Prometheus pod should not be ignored", - intervals: []monitorapi.Interval{ - { - Condition: monitorapi.Condition{ - Locator: openshiftMonitoringNsLocatorWithPodKey("prometheus-k8s-2"), - Message: unhealthyReasonPathologicalMessageWithHumanMessage("Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1", 100), - }, - }, - }, + name: "Readiness probe error (stopping container) on a Prometheus pod in a different namespace should not be ignored", + expectedMessage: "1 events happened too frequently\n\nevent happened 100 times, something is wrong: namespace/foo pod/prometheus-k8s-1 - reason/Unhealthy Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 (00:00:00Z) result=reject ", + pod: "prometheus-k8s-1", + ns: "foo", + humanMessage: "Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1", + repetitionCount: 100, + }, + { + name: "Readiness probe error (stopping container) on non-existent Prometheus pod should not be ignored", expectedMessage: "1 events happened too frequently\n\nevent 
happened 100 times, something is wrong: namespace/openshift-monitoring pod/prometheus-k8s-2 - reason/Unhealthy Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 (00:00:00Z) result=reject ", + pod: "prometheus-k8s-2", + ns: namespace, + humanMessage: "Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1", + repetitionCount: 100, }, { - name: "Readiness probe error (stopping container, different human message) on a Prometheus pod should not be ignored above the acceptable limit", - intervals: []monitorapi.Interval{ - { - Condition: monitorapi.Condition{ - Locator: openshiftMonitoringNsLocatorWithPodKey("prometheus-k8s-1"), - Message: unhealthyReasonPathologicalMessageWithHumanMessage("Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1", 101), - }, - }, - }, + name: "Readiness probe error (stopping container, different human message) on a Prometheus pod should not be ignored above the acceptable limit", expectedMessage: "1 events happened too frequently\n\nevent happened 101 times, something is wrong: namespace/openshift-monitoring pod/prometheus-k8s-1 - reason/Unhealthy Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 (00:00:00Z) result=reject ", + pod: "prometheus-k8s-1", + ns: namespace, + humanMessage: "Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1", + repetitionCount: 101, }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { - events := monitorapi.Intervals(test.intervals) + events := monitorapi.Intervals([]monitorapi.Interval{ + { + Condition: monitorapi.Condition{ + Locator: nsLocatorWithPodKey(test.pod, test.ns), + Message: unhealthyReasonPathologicalMessageWithHumanMessage(test.humanMessage, test.repetitionCount), + }, + }, + }) evaluator := duplicateEventsEvaluator{ - registry: NewUpgradePathologicalEventMatchers(nil, events), + registry: NewUniversalPathologicalEventMatchers(nil, events), } testName := "events should not repeat" junits := evaluator.testDuplicatedEvents(testName, false, events, nil, false) - jUnitName := getJUnitName(testName, namespace) + jUnitName := getJUnitName(testName, test.ns) for _, junit := range junits { if junit.Name == jUnitName { if test.expectedMessage != "" { From 85008d1964ba14a5e0509c5749d2478042b9ef15 Mon Sep 17 00:00:00 2001 From: Pranshu Srivastava Date: Wed, 5 Nov 2025 00:19:56 +0530 Subject: [PATCH 6/6] fixup! fixup! fixup! fixup! fixup! 
OCPBUGS-62703: Relax duplicate events detection for Prometheus --- .../duplicated_event_patterns.go | 73 +++++++++---------- .../duplicated_events_test.go | 2 +- 2 files changed, 37 insertions(+), 38 deletions(-) diff --git a/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go b/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go index 44cbc2986231..7e86cac87ee5 100644 --- a/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go +++ b/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_event_patterns.go @@ -502,43 +502,6 @@ func NewUniversalPathologicalEventMatchers(kubeConfig *rest.Config, finalInterva twoNodeEtcdEndpointsMatcher := newTwoNodeEtcdEndpointsConfigMissingEventMatcher(finalIntervals) registry.AddPathologicalEventMatcherOrDie(twoNodeEtcdEndpointsMatcher) - registry.AddPathologicalEventMatcherOrDie(&SimplePathologicalEventMatcher{ - name: "PrometheusReadinessProbeErrors", - locatorKeyRegexes: map[monitorapi.LocatorKey]*regexp.Regexp{ - monitorapi.LocatorNamespaceKey: regexp.MustCompile(`^openshift-monitoring$`), - monitorapi.LocatorPodKey: regexp.MustCompile(`^prometheus-k8s-[0,1]$`), - }, - messageReasonRegex: regexp.MustCompile(`^` + string(monitorapi.UnhealthyReason) + `$`), - messageHumanRegex: regexp.MustCompile("Readiness probe errored"), - jira: "https://issues.redhat.com/browse/OCPBUGS-62703", - /* - 05:50:32 openshift-monitoring kubelet prometheus-k8s-1 - Unhealthy - Readiness probe errored: rpc error: code = NotFound desc = container is not created or running: checking if PID of 58577e7deb7b8ae87b8029b9988fa268613748d0743ce989748f27e52b199ef5 is running failed: container process not found - - 05:53:52 (x25) openshift-monitoring kubelet prometheus-k8s-0 - Unhealthy - Readiness probe errored: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 - - 11:44:16 (x56) openshift-monitoring kubelet prometheus-k8s-0 - Unhealthy - Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 - - Readiness probes run during the lifecycle of the container, including termination. - Prometheus pods may take some time to stop, and thus result in more kubelet pings than permitted by default (20). - With a termination grace period of 600s, these pods may lead to probe errors (e.g. the web service is stopped but the process is still running), which is expected during upgrades. - - To address this, set the threshold to 100 (approximately 600 (termination period) / 5 (probe interval)), to allow for a high number of readiness probe errors during the upgrade, but not so high that we would miss a real problem. - The job below hit ~60 readiness errors during the upgrade: - https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.20-upgrade-from-stable-4.19-e2e-aws-ovn-upgrade/1977094149035266048, which makes sense to ignore, - However, the job below hit readiness errors 774 times during the upgrade: - https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.19-upgrade-from-stable-4.18-e2e-metal-ovn-single-node-rt-upgrade-test/1975691393640697856, which should be caught. - - Also, do note that these events were exhibited outside of upgrades as well, so we need to allow them in general. 
- */ - repeatThresholdOverride: 100, - }) - return registry } @@ -601,6 +564,42 @@ func NewUpgradePathologicalEventMatchers(kubeConfig *rest.Config, finalIntervals m := newFailedSchedulingDuringNodeUpdatePathologicalEventMatcher(finalIntervals) registry.AddPathologicalEventMatcherOrDie(m) + // Prometheus pods may have readiness probe errors during upgrades. + registry.AddPathologicalEventMatcherOrDie(&SimplePathologicalEventMatcher{ + name: "PrometheusReadinessProbeErrors", + locatorKeyRegexes: map[monitorapi.LocatorKey]*regexp.Regexp{ + monitorapi.LocatorNamespaceKey: regexp.MustCompile(`^openshift-monitoring$`), + monitorapi.LocatorPodKey: regexp.MustCompile(`^prometheus-k8s-[0,1]$`), + }, + messageReasonRegex: regexp.MustCompile(`^` + string(monitorapi.UnhealthyReason) + `$`), + messageHumanRegex: regexp.MustCompile("Readiness probe errored"), + jira: "https://issues.redhat.com/browse/OCPBUGS-62703", + /* + 05:50:32 openshift-monitoring kubelet prometheus-k8s-1 + Unhealthy + Readiness probe errored: rpc error: code = NotFound desc = container is not created or running: checking if PID of 58577e7deb7b8ae87b8029b9988fa268613748d0743ce989748f27e52b199ef5 is running failed: container process not found + + 05:53:52 (x25) openshift-monitoring kubelet prometheus-k8s-0 + Unhealthy + Readiness probe errored: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 + + 11:44:16 (x56) openshift-monitoring kubelet prometheus-k8s-0 + Unhealthy + Readiness probe errored and resulted in unknown state: rpc error: code = Unknown desc = command error: cannot register an exec PID: container is stopping, stdout: , stderr: , exit code -1 + + Readiness probes run during the lifecycle of the container, including termination. + Prometheus pods may take some time to stop, and thus result in more kubelet pings than permitted by default (20). + With a termination grace period of 600s, these pods may lead to probe errors (e.g. the web service is stopped but the process is still running), which is expected during upgrades. + + To address this, set the threshold to 100 (approximately 600 (termination period) / 5 (probe interval)), to allow for a high number of readiness probe errors during the upgrade, but not so high that we would miss a real problem. + The job below hit ~60 readiness errors during the upgrade: + https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.20-upgrade-from-stable-4.19-e2e-aws-ovn-upgrade/1977094149035266048, which makes sense to ignore, + However, the job below hit readiness errors 774 times during the upgrade: + https://prow.ci.openshift.org/view/gs/test-platform-results/logs/periodic-ci-openshift-release-master-ci-4.19-upgrade-from-stable-4.18-e2e-metal-ovn-single-node-rt-upgrade-test/1975691393640697856, which should be caught. 
+ */ + repeatThresholdOverride: 100, + }) + return registry } diff --git a/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events_test.go b/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events_test.go index 45d9a702d201..dbc9298eb3c8 100644 --- a/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events_test.go +++ b/pkg/monitortestlibrary/pathologicaleventlibrary/duplicated_events_test.go @@ -761,7 +761,7 @@ func TestPathologicalEventsPrometheusReadinessProbeErrors(t *testing.T) { }, }) evaluator := duplicateEventsEvaluator{ - registry: NewUniversalPathologicalEventMatchers(nil, events), + registry: NewUpgradePathologicalEventMatchers(nil, events), } testName := "events should not repeat"
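
For a quick mental model of what this series ends up gating, the matcher registered in the final patch boils down to a count comparison guarded by a few regexes. The sketch below is a simplified, self-contained illustration of that decision under the thresholds discussed in the patch comments; the names `event` and `allowPrometheusReadinessEvent` are invented for the example and do not appear in the origin codebase, which expresses the same logic through a SimplePathologicalEventMatcher with repeatThresholdOverride.

package main

import (
	"fmt"
	"regexp"
)

// event is an illustrative stand-in for a pathological interval: a kubelet
// event with its locator fields and the repeat count parsed from "(xN)".
type event struct {
	Namespace string
	Pod       string
	Reason    string
	Message   string
	Count     int
}

var (
	promNamespace = regexp.MustCompile(`^openshift-monitoring$`)
	promPod       = regexp.MustCompile(`^prometheus-k8s-[01]$`)
	probeErrored  = regexp.MustCompile(`Readiness probe errored`)
)

// The prometheus-k8s pods have a 600s termination grace period and a ~5s
// probe period, so on the order of 100-120 failed probes are plausible during
// a clean shutdown; 100 tolerates that while still flagging runaway cases
// such as the 774-repeat single-node job cited in the patch comments.
const prometheusReadinessThreshold = 100

// allowPrometheusReadinessEvent reports whether a repeating readiness-probe
// error event should be excused by the relaxed threshold.
func allowPrometheusReadinessEvent(e event) bool {
	if !promNamespace.MatchString(e.Namespace) ||
		!promPod.MatchString(e.Pod) ||
		e.Reason != "Unhealthy" ||
		!probeErrored.MatchString(e.Message) {
		return false // not covered by this exception; the default repeat limit still applies
	}
	return e.Count <= prometheusReadinessThreshold
}

func main() {
	tolerated := event{"openshift-monitoring", "prometheus-k8s-0", "Unhealthy",
		"Readiness probe errored: rpc error: code = Unknown desc = container is stopping", 60}
	rejected := event{"openshift-monitoring", "prometheus-k8s-1", "Unhealthy",
		"Readiness probe errored: rpc error: code = Unknown desc = container is stopping", 774}
	fmt.Println(allowPrometheusReadinessEvent(tolerated)) // true: within the relaxed limit
	fmt.Println(allowPrometheusReadinessEvent(rejected))  // false: still fails the repeat check
}

As the unit tests above suggest, in the real code the repeat count arrives via the pathological/count annotations on the interval message, and the comparison against repeatThresholdOverride is performed by the shared duplicated-events evaluator rather than by the matcher itself.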