Address review comments: rework node and machine checks in needsRemediation() method

furkatgofurov7 · furkatgofurov7 · commit f96b74299588 · 2025-10-13T23:03:53.000+03:00
If both a node condition and a machine condition are unhealthy, pick one reason but
combine all the messages.

Signed-off-by: Furkat Gofurov <furkat.gofurov@suse.com>
diff --git a/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go b/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go
@@ -19,6 +19,7 @@ package machinehealthcheck
 import (
 	"context"
 	"fmt"
+	"strings"
 	"time"
 
 	"github.com/go-logr/logr"
@@ -181,6 +182,13 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
 		return false, nextCheck
 	}
 
+	// Collect all unhealthy conditions (both node and machine) to provide comprehensive status
+	var (
+		unhealthyMessages       []string
+		unhealthyReasons        []string
+		foundUnhealthyCondition bool
+	)
+
 	// check node conditions
 	for _, c := range t.MHC.Spec.Checks.UnhealthyNodeConditions {
 		nodeCondition := getNodeCondition(t.Node, c.Type)
@@ -192,20 +200,15 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
 		}
 
 		// If the node condition has been in the unhealthy state for longer than the
-		// timeout, return true with no requeue time.
+		// timeout, mark as unhealthy and collect the message.
 		timeoutSecondsDuration := time.Duration(ptr.Deref(c.TimeoutSeconds, 0)) * time.Second
 
 		if nodeCondition.LastTransitionTime.Add(timeoutSecondsDuration).Before(now) {
-			v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, clusterv1.UnhealthyNodeConditionV1Beta1Reason, clusterv1.ConditionSeverityWarning, "Condition %s on Node is reporting status %s for more than %s", c.Type, c.Status, timeoutSecondsDuration.String())
-			logger.V(3).Info("Target is unhealthy: condition is in state longer than allowed timeout", "condition", c.Type, "state", c.Status, "timeout", timeoutSecondsDuration.String())
-
-			conditions.Set(t.Machine, metav1.Condition{
-				Type:    clusterv1.MachineHealthCheckSucceededCondition,
-				Status:  metav1.ConditionFalse,
-				Reason:  clusterv1.MachineHealthCheckUnhealthyNodeReason,
-				Message: fmt.Sprintf("Health check failed: Condition %s on Node is reporting status %s for more than %s", c.Type, c.Status, timeoutSecondsDuration.String()),
-			})
-			return true, time.Duration(0)
+			foundUnhealthyCondition = true
+			unhealthyMessages = append(unhealthyMessages, fmt.Sprintf("Node condition %s is %s for more than %s", c.Type, c.Status, timeoutSecondsDuration.String()))
+			unhealthyReasons = append(unhealthyReasons, "UnhealthyNode")
+			logger.V(3).Info("Target is unhealthy: node condition is in state longer than allowed timeout", "condition", c.Type, "state", c.Status, "timeout", timeoutSecondsDuration.String())
+			continue
 		}
 
 		durationUnhealthy := now.Sub(nodeCondition.LastTransitionTime.Time)
@@ -226,20 +229,15 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
 		}
 
 		// If the machine condition has been in the unhealthy state for longer than the
-		// timeout, return true with no requeue time.
+		// timeout, mark as unhealthy and collect the message.
 		timeoutSecondsDuration := time.Duration(ptr.Deref(c.TimeoutSeconds, 0)) * time.Second
 
 		if machineCondition.LastTransitionTime.Add(timeoutSecondsDuration).Before(now) {
-			v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, clusterv1.UnhealthyMachineConditionV1Beta1Reason, clusterv1.ConditionSeverityWarning, "Condition %s on Machine is reporting status %s for more than %s", c.Type, c.Status, timeoutSecondsDuration.String())
-			logger.V(3).Info("Target is unhealthy: condition is in state longer than allowed timeout", "condition", c.Type, "state", c.Status, "timeout", timeoutSecondsDuration.String())
-
-			conditions.Set(t.Machine, metav1.Condition{
-				Type:    clusterv1.MachineHealthCheckSucceededCondition,
-				Status:  metav1.ConditionFalse,
-				Reason:  clusterv1.MachineHealthCheckUnhealthyMachineReason,
-				Message: fmt.Sprintf("Health check failed: Condition %s on Machine is reporting status %s for more than %s", c.Type, c.Status, timeoutSecondsDuration.String()),
-			})
-			return true, time.Duration(0)
+			foundUnhealthyCondition = true
+			unhealthyMessages = append(unhealthyMessages, fmt.Sprintf("Machine condition %s is %s for more than %s", c.Type, c.Status, timeoutSecondsDuration.String()))
+			unhealthyReasons = append(unhealthyReasons, "UnhealthyMachine")
+			logger.V(3).Info("Target is unhealthy: machine condition is in state longer than allowed timeout", "condition", c.Type, "state", c.Status, "timeout", timeoutSecondsDuration.String())
+			continue
 		}
 
 		durationUnhealthy := now.Sub(machineCondition.LastTransitionTime.Time)
@@ -249,6 +247,52 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
 		}
 	}
 
+	// If any unhealthy conditions were found, set the combined status
+	if foundUnhealthyCondition {
+		// Determine the primary reason based on a consistent priority order:
+		// 1. If both node and machine conditions are unhealthy, use the machine reason (messages are combined)
+		// 2. Otherwise use the specific reason for the type that failed
+		var primaryReason, v1beta1Reason string
+		if len(unhealthyReasons) > 0 {
+			// Check if we have both node and machine reasons
+			hasNodeReason := false
+			hasMachineReason := false
+			for _, reason := range unhealthyReasons {
+				switch reason {
+				case "UnhealthyNode":
+					hasNodeReason = true
+				case "UnhealthyMachine":
+					hasMachineReason = true
+				}
+			}
+
+			if hasNodeReason && hasMachineReason {
+				// Both types of conditions are unhealthy - use machine reason but indicate it's combined
+				primaryReason = clusterv1.MachineHealthCheckUnhealthyMachineReason
+				v1beta1Reason = clusterv1.UnhealthyMachineConditionV1Beta1Reason
+			} else if hasMachineReason {
+				primaryReason = clusterv1.MachineHealthCheckUnhealthyMachineReason
+				v1beta1Reason = clusterv1.UnhealthyMachineConditionV1Beta1Reason
+			} else if hasNodeReason {
+				primaryReason = clusterv1.MachineHealthCheckUnhealthyNodeReason
+				v1beta1Reason = clusterv1.UnhealthyNodeConditionV1Beta1Reason
+			}
+		}
+
+		// Combine all messages into a single comprehensive message
+		combinedMessage := fmt.Sprintf("Health check failed: %s", strings.Join(unhealthyMessages, "; "))
+
+		v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, v1beta1Reason, clusterv1.ConditionSeverityWarning, "%s", combinedMessage)
+
+		conditions.Set(t.Machine, metav1.Condition{
+			Type:    clusterv1.MachineHealthCheckSucceededCondition,
+			Status:  metav1.ConditionFalse,
+			Reason:  primaryReason,
+			Message: combinedMessage,
+		})
+		return true, time.Duration(0)
+	}
+
 	return false, minDuration(nextCheckTimes)
 }
 
diff --git a/internal/controllers/machinehealthcheck/machinehealthcheck_targets_test.go b/internal/controllers/machinehealthcheck/machinehealthcheck_targets_test.go
@@ -390,7 +390,7 @@ func TestHealthCheckTargets(t *testing.T) {
 		Node:        testNodeUnknown400,
 		nodeMissing: false,
 	}
-	nodeUnknown400Condition := newFailedHealthCheckV1Beta1Condition(clusterv1.UnhealthyNodeConditionV1Beta1Reason, "Condition Ready on node is reporting status Unknown for more than %s", (time.Duration(timeoutForUnhealthyNodeConditions) * time.Second).String())
+	nodeUnknown400Condition := newFailedHealthCheckV1Beta1Condition(clusterv1.UnhealthyNodeConditionV1Beta1Reason, "Condition Ready on Node is reporting status Unknown for more than %s", (time.Duration(timeoutForUnhealthyNodeConditions) * time.Second).String())
 	nodeUnknown400V1Beta2Condition := newFailedHealthCheckCondition(clusterv1.MachineHealthCheckUnhealthyNodeReason, "Health check failed: Condition Ready on Node is reporting status Unknown for more than %s", (time.Duration(timeoutForUnhealthyNodeConditions) * time.Second).String())
 
 	// Target for when a node is healthy
@@ -425,7 +425,7 @@ func TestHealthCheckTargets(t *testing.T) {
 	}
 	machineUnhealthy400Condition := newFailedHealthCheckV1Beta1Condition(
 		clusterv1.UnhealthyMachineConditionV1Beta1Reason,
-		"Condition EtcdPodHealthy on machine is reporting status False for more than %s",
+		"Condition EtcdPodHealthy on Machine is reporting status False for more than %s",
 		(time.Duration(timeoutForUnhealthyMachineConditions) * time.Second).String(),
 	)
 	machineUnhealthy400V1Beta2Condition := newFailedHealthCheckCondition(
diff --git a/internal/webhooks/machinehealthcheck_test.go b/internal/webhooks/machinehealthcheck_test.go
@@ -301,7 +301,7 @@ func TestMachineHealthCheckUnhealthyMachineConditions(t *testing.T) {
 			expectErr:                  false,
 		},
 		{
-			name:                       "do not fail if the UnhealthyMachineCondition array is nil",
+			name:                       "do not fail if the UnhealthyMachineCondition array is empty",
 			unhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{},
 			expectErr:                  false,
 		},

Original file line number	Diff line number	Diff line change
`@@ -390,7 +390,7 @@ func TestHealthCheckTargets(t *testing.T) {`
`390`	`390`	`Node: testNodeUnknown400,`
`391`	`391`	`nodeMissing: false,`
`392`	`392`	`}`
`393`		`- nodeUnknown400Condition := newFailedHealthCheckV1Beta1Condition(clusterv1.UnhealthyNodeConditionV1Beta1Reason, "Condition Ready on node is reporting status Unknown for more than %s", (time.Duration(timeoutForUnhealthyNodeConditions) * time.Second).String())`
	`393`	`+ nodeUnknown400Condition := newFailedHealthCheckV1Beta1Condition(clusterv1.UnhealthyNodeConditionV1Beta1Reason, "Condition Ready on Node is reporting status Unknown for more than %s", (time.Duration(timeoutForUnhealthyNodeConditions) * time.Second).String())`
`394`	`394`	`nodeUnknown400V1Beta2Condition := newFailedHealthCheckCondition(clusterv1.MachineHealthCheckUnhealthyNodeReason, "Health check failed: Condition Ready on Node is reporting status Unknown for more than %s", (time.Duration(timeoutForUnhealthyNodeConditions) * time.Second).String())`
`395`	`395`
`396`	`396`	`// Target for when a node is healthy`
`@@ -425,7 +425,7 @@ func TestHealthCheckTargets(t *testing.T) {`
`425`	`425`	`}`
`426`	`426`	`machineUnhealthy400Condition := newFailedHealthCheckV1Beta1Condition(`
`427`	`427`	`clusterv1.UnhealthyMachineConditionV1Beta1Reason,`
`428`		`- "Condition EtcdPodHealthy on machine is reporting status False for more than %s",`
	`428`	`+ "Condition EtcdPodHealthy on Machine is reporting status False for more than %s",`
`429`	`429`	`(time.Duration(timeoutForUnhealthyMachineConditions) * time.Second).String(),`
`430`	`430`	`)`
`431`	`431`	`machineUnhealthy400V1Beta2Condition := newFailedHealthCheckCondition(`
Original file line number	Diff line number	Diff line change
`@@ -301,7 +301,7 @@ func TestMachineHealthCheckUnhealthyMachineConditions(t *testing.T) {`
`301`	`301`	`expectErr: false,`
`302`	`302`	`},`
`303`	`303`	`{`
`304`		`- name: "do not fail if the UnhealthyMachineCondition array is nil",`
	`304`	`+ name: "do not fail if the UnhealthyMachineCondition array is empty",`
`305`	`305`	`unhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{},`
`306`	`306`	`expectErr: false,`
`307`	`307`	`},`