
Commit 0415ae3

Experiment
1 parent 2c1ca03 commit 0415ae3


internal/controllers/machinehealthcheck/machinehealthcheck_targets.go

Lines changed: 60 additions & 116 deletions
@@ -89,9 +89,6 @@ func (t *healthCheckTarget) nodeName() string {
 // Returns true if remediation is needed, and a duration indicating when to recheck if remediation
 // is not immediately required. The target should be requeued after this duration.
 func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachineToHaveNode metav1.Duration) (bool, time.Duration) {
-    var nextCheckTimes []time.Duration
-    now := time.Now()
-
     if annotations.HasRemediateMachine(t.Machine) {
         v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, clusterv1.HasRemediateMachineAnnotationV1Beta1Reason, clusterv1.ConditionSeverityWarning, "Marked for remediation via remediate-machine annotation")
         logger.V(3).Info("Target is marked for remediation via remediate-machine annotation")
@@ -120,12 +117,47 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
         return false, 0
     }
 
-    // Collect all unhealthy conditions (both node and machine) to provide comprehensive status
-    // Always evaluate machine conditions regardless of node state for consistent behavior
-    var (
-        unhealthyNodeMessages    []string
-        unhealthyMachineMessages []string
-    )
+    // Check machine conditions
+    unhealthyMachineMessages, nextMachineCheck := t.machineChecks(logger)
+
+    // Check node conditions
+    nodeConditionReason, nodeV1beta1ConditionReason, unhealthyNodeMessages, nextNodeCheck := t.nodeChecks(logger, timeoutForMachineToHaveNode)
+
+    // Combine results
+    if len(unhealthyMachineMessages) == 0 && len(unhealthyNodeMessages) == 0 {
+        return false, minDuration([]time.Duration{nextMachineCheck, nextNodeCheck})
+    }
+
+    reason := nodeConditionReason
+    v1beta1Reason := nodeV1beta1ConditionReason
+    if len(unhealthyMachineMessages) > 0 {
+        reason = clusterv1.MachineHealthCheckUnhealthyMachineReason
+        v1beta1Reason = clusterv1.UnhealthyMachineConditionV1Beta1Reason
+    }
+    // Combine machine and node messages into a single condition message.
+    allMessages := append(unhealthyMachineMessages, unhealthyNodeMessages...)
+
+    conditionMessage := fmt.Sprintf("Health check failed: %s", strings.Join(allMessages, "; "))
+    conditions.Set(t.Machine, metav1.Condition{
+        Type:    clusterv1.MachineHealthCheckSucceededCondition,
+        Status:  metav1.ConditionFalse,
+        Reason:  reason,
+        Message: conditionMessage,
+    })
+
+    if len(unhealthyMachineMessages) > 0 {
+        v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, v1beta1Reason, clusterv1.ConditionSeverityWarning, "%s", strings.Join(allMessages, "; "))
+    } else {
+        v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, v1beta1Reason, clusterv1.ConditionSeverityWarning, "")
+    }
+
+    return true, time.Duration(0)
+}
+
+func (t *healthCheckTarget) machineChecks(logger logr.Logger) ([]string, time.Duration) {
+    var unhealthyMachineMessages []string
+    var nextCheckTimes []time.Duration
+    now := time.Now()
 
     // Always check machine conditions first, regardless of node state
     for _, c := range t.MHC.Spec.Checks.UnhealthyMachineConditions {
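
The healthy path above returns the earlier of the two recheck hints via minDuration, a helper that already exists in this file but is not shown in the diff. A minimal sketch of its assumed behavior, for reference only (shortest duration wins, zero for an empty slice; the real helper may differ in details):

func minDuration(durations []time.Duration) time.Duration {
    if len(durations) == 0 {
        return time.Duration(0)
    }
    minDur := durations[0]
    for _, d := range durations {
        if d < minDur {
            minDur = d
        }
    }
    return minDur
}

Note that under this assumed semantics a zero value, which the new helpers use for "no recheck scheduled", wins over any positive interval, so the combined requeue hint is only non-zero when both checks report one.
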
@@ -154,65 +186,29 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
         }
     }
 
+    if len(unhealthyMachineMessages) > 0 {
+        return unhealthyMachineMessages, time.Duration(0)
+    }
+    return nil, minDuration(nextCheckTimes)
+}
+
+func (t *healthCheckTarget) nodeChecks(logger logr.Logger, timeoutForMachineToHaveNode metav1.Duration) (string, string, []string, time.Duration) {
+    var nextCheckTimes []time.Duration
+    now := time.Now()
+
     // Machine has Status.NodeRef set, although we couldn't find the node in the workload cluster.
     if t.nodeMissing {
         logger.V(3).Info("Target is unhealthy: node is missing")
-
-        // Always merge node missing message with any unhealthy machine conditions
         nodeMissingMessage := fmt.Sprintf("Node %s has been deleted", t.Machine.Status.NodeRef.Name)
-        allMessages := append([]string{nodeMissingMessage}, unhealthyMachineMessages...)
-
-        reason := clusterv1.MachineHealthCheckNodeDeletedReason
-        v1beta1Reason := clusterv1.NodeNotFoundV1Beta1Reason
-        if len(unhealthyMachineMessages) > 0 {
-            reason = clusterv1.MachineHealthCheckUnhealthyMachineReason
-            v1beta1Reason = clusterv1.UnhealthyMachineConditionV1Beta1Reason
-        }
-
-        // For v1beta2 we use a single-line message prefixed with "Health check failed: "
-        conditionMessage := fmt.Sprintf("Health check failed: %s", strings.Join(allMessages, "; "))
-        conditions.Set(t.Machine, metav1.Condition{
-            Type:    clusterv1.MachineHealthCheckSucceededCondition,
-            Status:  metav1.ConditionFalse,
-            Reason:  reason,
-            Message: conditionMessage,
-        })
-
-        // For v1beta1 keep the existing format
-        if len(unhealthyMachineMessages) > 0 {
-            v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, v1beta1Reason, clusterv1.ConditionSeverityWarning, "%s", strings.Join(allMessages, "; "))
-        } else {
-            v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, v1beta1Reason, clusterv1.ConditionSeverityWarning, "")
-        }
-        return true, time.Duration(0)
+        return clusterv1.MachineHealthCheckNodeDeletedReason, clusterv1.NodeNotFoundV1Beta1Reason, []string{nodeMissingMessage}, time.Duration(0)
     }
 
     // the node has not been set yet
     if t.Node == nil {
-        // Check if we already have unhealthy machine conditions that should trigger remediation
-        if len(unhealthyMachineMessages) > 0 {
-            reason := clusterv1.MachineHealthCheckUnhealthyMachineReason
-            v1beta1Reason := clusterv1.UnhealthyMachineConditionV1Beta1Reason
-
-            // For v1beta2 we use a single-line message prefixed with "Health check failed: "
-            conditionMessage := fmt.Sprintf("Health check failed: %s", strings.Join(unhealthyMachineMessages, "; "))
-            conditions.Set(t.Machine, metav1.Condition{
-                Type:    clusterv1.MachineHealthCheckSucceededCondition,
-                Status:  metav1.ConditionFalse,
-                Reason:  reason,
-                Message: conditionMessage,
-            })
-
-            // For v1beta1 keep the existing semicolon-separated format (no prefix).
-            v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, v1beta1Reason, clusterv1.ConditionSeverityWarning, "%s", strings.Join(unhealthyMachineMessages, "; "))
-
-            return true, time.Duration(0)
-        }
-
         if timeoutForMachineToHaveNode == disabledNodeStartupTimeout {
             // Startup timeout is disabled so no need to go any further.
             // No node yet to check conditions, can return early here.
-            return false, minDuration(nextCheckTimes)
+            return "", "", nil, time.Duration(0)
         }
 
         controlPlaneInitialized := conditions.GetLastTransitionTime(t.Cluster, clusterv1.ClusterControlPlaneInitializedCondition)
@@ -241,47 +237,18 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
 
         timeoutDuration := timeoutForMachineToHaveNode.Duration
         if comparisonTime.Add(timeoutForMachineToHaveNode.Duration).Before(now) {
-            // Node startup timeout - merge with any unhealthy machine conditions
-            nodeTimeoutMessage := fmt.Sprintf("Node failed to report startup in %s", timeoutDuration)
-            allMessages := append([]string{nodeTimeoutMessage}, unhealthyMachineMessages...)
-
-            reason := clusterv1.MachineHealthCheckNodeStartupTimeoutReason
-            v1beta1Reason := clusterv1.NodeStartupTimeoutV1Beta1Reason
-            if len(unhealthyMachineMessages) > 0 {
-                reason = clusterv1.MachineHealthCheckUnhealthyMachineReason
-                v1beta1Reason = clusterv1.UnhealthyMachineConditionV1Beta1Reason
-            }
-
             logger.V(3).Info("Target is unhealthy: machine has no node", "duration", timeoutDuration)
-
-            // For v1beta2 we use a single-line message prefixed with "Health check failed: "
-            conditionMessage := fmt.Sprintf("Health check failed: %s", strings.Join(allMessages, "; "))
-            conditions.Set(t.Machine, metav1.Condition{
-                Type:    clusterv1.MachineHealthCheckSucceededCondition,
-                Status:  metav1.ConditionFalse,
-                Reason:  reason,
-                Message: conditionMessage,
-            })
-
-            // For v1beta1 keep the existing format
-            if len(unhealthyMachineMessages) > 0 {
-                v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, v1beta1Reason, clusterv1.ConditionSeverityWarning, "%s", strings.Join(allMessages, "; "))
-            } else {
-                v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, v1beta1Reason, clusterv1.ConditionSeverityWarning, "Node failed to report startup in %s", timeoutDuration)
-            }
-            return true, time.Duration(0)
+            nodeTimeoutMessage := fmt.Sprintf("Node failed to report startup in %s", timeoutDuration)
+            return clusterv1.MachineHealthCheckNodeStartupTimeoutReason, clusterv1.NodeStartupTimeoutV1Beta1Reason, []string{nodeTimeoutMessage}, time.Duration(0)
         }
 
         durationUnhealthy := now.Sub(comparisonTime)
         nextCheck := timeoutDuration - durationUnhealthy + time.Second
-        if nextCheck > 0 {
-            nextCheckTimes = append(nextCheckTimes, nextCheck)
-        }
-
-        return false, minDuration(nextCheckTimes)
+        return "", "", nil, nextCheck
     }
 
     // check node conditions (only when node is available)
+    var unhealthyNodeMessages []string
     for _, c := range t.MHC.Spec.Checks.UnhealthyNodeConditions {
         nodeCondition := getNodeCondition(t.Node, c.Type)
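
The recheck interval computed just above is the remaining startup budget plus a one-second buffer. Because this branch is only reached while the timeout has not yet expired, the result is always positive, which is presumably why the old "if nextCheck > 0" guard could be dropped. A short, self-contained illustration of the arithmetic with hypothetical values:

package main

import (
    "fmt"
    "time"
)

func main() {
    // Hypothetical values: a 10-minute node startup timeout and a
    // machine whose comparison time was 3 minutes ago.
    timeoutDuration := 10 * time.Minute
    durationUnhealthy := 3 * time.Minute

    // Same arithmetic as nodeChecks: wait out the remaining startup
    // budget, plus a one-second buffer.
    nextCheck := timeoutDuration - durationUnhealthy + time.Second
    fmt.Println(nextCheck) // 7m1s
}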

@@ -308,33 +275,10 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
         }
     }
 
-    if len(unhealthyNodeMessages) > 0 || len(unhealthyMachineMessages) > 0 {
-        reason := clusterv1.MachineHealthCheckUnhealthyNodeReason
-        v1beta1Reason := clusterv1.UnhealthyNodeConditionV1Beta1Reason
-        if len(unhealthyMachineMessages) > 0 {
-            reason = clusterv1.MachineHealthCheckUnhealthyMachineReason
-            v1beta1Reason = clusterv1.UnhealthyMachineConditionV1Beta1Reason
-        }
-
-        // Combine all messages into a single comprehensive message
-        allMessages := append(unhealthyNodeMessages, unhealthyMachineMessages...)
-        // For v1beta2 we use a single-line message prefixed with "Health check failed: "
-        // for compatibility with existing tests and other condition messages.
-        conditionMessage := fmt.Sprintf("Health check failed: %s", strings.Join(allMessages, "; "))
-        conditions.Set(t.Machine, metav1.Condition{
-            Type:    clusterv1.MachineHealthCheckSucceededCondition,
-            Status:  metav1.ConditionFalse,
-            Reason:  reason,
-            Message: conditionMessage,
-        })
-
-        // For v1beta1 keep the existing semicolon-separated format (no prefix).
-        v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, v1beta1Reason, clusterv1.ConditionSeverityWarning, "%s", strings.Join(allMessages, "; "))
-
-        return true, time.Duration(0)
+    if len(unhealthyNodeMessages) > 0 {
+        return clusterv1.MachineHealthCheckUnhealthyNodeReason, clusterv1.UnhealthyNodeConditionV1Beta1Reason, unhealthyNodeMessages, time.Duration(0)
     }
-
-    return false, minDuration(nextCheckTimes)
+    return "", "", nil, minDuration(nextCheckTimes)
 }
 
 // getTargetsFromMHC uses the MachineHealthCheck's selector to fetch machines
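
Stepping back, the shape of the refactoring is: machineChecks and nodeChecks each report failure messages plus a recheck hint, and needsRemediation alone combines them and writes the conditions, with machine problems taking precedence over node problems for the reason. The toy program below illustrates that combining logic in isolation; every name in it is hypothetical, and it sketches the pattern rather than the real Cluster API types:

package main

import (
    "fmt"
    "strings"
    "time"
)

// checkResult mirrors the shape of the new helpers' return values:
// failure messages plus a hint for when to recheck a healthy target.
type checkResult struct {
    reason    string
    messages  []string
    nextCheck time.Duration
}

// combine mimics needsRemediation: a healthy target gets the earliest
// recheck hint; an unhealthy one gets a merged message, with the
// machine reason overriding the node reason.
func combine(machine, node checkResult) (bool, string, time.Duration) {
    if len(machine.messages) == 0 && len(node.messages) == 0 {
        next := machine.nextCheck
        if node.nextCheck < next {
            next = node.nextCheck
        }
        return false, "", next
    }
    reason := node.reason
    if len(machine.messages) > 0 {
        reason = machine.reason
    }
    all := append(machine.messages, node.messages...)
    return true, reason + ": Health check failed: " + strings.Join(all, "; "), 0
}

func main() {
    machine := checkResult{reason: "UnhealthyMachine", messages: []string{"Machine condition Updating is True"}}
    node := checkResult{reason: "UnhealthyNode", messages: []string{"Node condition Ready is False"}}
    remediate, msg, _ := combine(machine, node)
    fmt.Println(remediate, msg)
}

Keeping the condition writes in one place, instead of repeating them in every unhealthy branch, is what lets the v1beta1 and v1beta2 message formats stay consistent across the node-missing, startup-timeout, and unhealthy-condition paths that the deleted code handled separately.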
