@@ -89,9 +89,6 @@ func (t *healthCheckTarget) nodeName() string {
8989// Returns true if remediation is needed, and a duration indicating when to recheck if remediation
9090// is not immediately required. The target should be requeued after this duration.
9191func (t * healthCheckTarget ) needsRemediation (logger logr.Logger , timeoutForMachineToHaveNode metav1.Duration ) (bool , time.Duration ) {
92- var nextCheckTimes []time.Duration
93- now := time .Now ()
94-
9592 if annotations .HasRemediateMachine (t .Machine ) {
9693 v1beta1conditions .MarkFalse (t .Machine , clusterv1 .MachineHealthCheckSucceededV1Beta1Condition , clusterv1 .HasRemediateMachineAnnotationV1Beta1Reason , clusterv1 .ConditionSeverityWarning , "Marked for remediation via remediate-machine annotation" )
9794 logger .V (3 ).Info ("Target is marked for remediation via remediate-machine annotation" )
@@ -120,12 +117,47 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
120117 return false , 0
121118 }
122119
123- // Collect all unhealthy conditions (both node and machine) to provide comprehensive status
124- // Always evaluate machine conditions regardless of node state for consistent behavior
125- var (
126- unhealthyNodeMessages []string
127- unhealthyMachineMessages []string
128- )
120+ // Check machine conditions
121+ unhealthyMachineMessages , nextMachineCheck := t .machineChecks (logger )
122+
123+ // Check node conditions
124+ nodeConditionReason , nodeV1beta1ConditionReason , unhealthyNodeMessages , nextNodeCheck := t .nodeChecks (logger , timeoutForMachineToHaveNode )
125+
126+ // Combine results
127+ if len (unhealthyMachineMessages ) == 0 && len (unhealthyNodeMessages ) == 0 {
128+ return false , minDuration ([]time.Duration {nextMachineCheck , nextNodeCheck })
129+ }
130+
131+ reason := nodeConditionReason
132+ v1beta1Reason := nodeV1beta1ConditionReason
133+ if len (unhealthyMachineMessages ) > 0 {
134+ reason = clusterv1 .MachineHealthCheckUnhealthyMachineReason
135+ v1beta1Reason = clusterv1 .UnhealthyMachineConditionV1Beta1Reason
136+ }
137+ // Fix condition Message
138+ allMessages := append (unhealthyMachineMessages , unhealthyNodeMessages ... )
139+
140+ conditionMessage := fmt .Sprintf ("Health check failed: %s" , strings .Join (allMessages , "; " ))
141+ conditions .Set (t .Machine , metav1.Condition {
142+ Type : clusterv1 .MachineHealthCheckSucceededCondition ,
143+ Status : metav1 .ConditionFalse ,
144+ Reason : reason ,
145+ Message : conditionMessage ,
146+ })
147+
148+ if len (unhealthyMachineMessages ) > 0 {
149+ v1beta1conditions .MarkFalse (t .Machine , clusterv1 .MachineHealthCheckSucceededV1Beta1Condition , v1beta1Reason , clusterv1 .ConditionSeverityWarning , "%s" , strings .Join (allMessages , "; " ))
150+ } else {
151+ v1beta1conditions .MarkFalse (t .Machine , clusterv1 .MachineHealthCheckSucceededV1Beta1Condition , v1beta1Reason , clusterv1 .ConditionSeverityWarning , "" )
152+ }
153+
154+ return true , time .Duration (0 )
155+ }
156+
157+ func (t * healthCheckTarget ) machineChecks (logger logr.Logger ) ([]string , time.Duration ) {
158+ var unhealthyMachineMessages []string
159+ var nextCheckTimes []time.Duration
160+ now := time .Now ()
129161
130162 // Always check machine conditions first, regardless of node state
131163 for _ , c := range t .MHC .Spec .Checks .UnhealthyMachineConditions {
@@ -154,65 +186,29 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
154186 }
155187 }
156188
189+ if len (unhealthyMachineMessages ) > 0 {
190+ return unhealthyMachineMessages , time .Duration (0 )
191+ }
192+ return nil , minDuration (nextCheckTimes )
193+ }
194+
195+ func (t * healthCheckTarget ) nodeChecks (logger logr.Logger , timeoutForMachineToHaveNode metav1.Duration ) (string , string , []string , time.Duration ) {
196+ var nextCheckTimes []time.Duration
197+ now := time .Now ()
198+
157199 // Machine has Status.NodeRef set, although we couldn't find the node in the workload cluster.
158200 if t .nodeMissing {
159201 logger .V (3 ).Info ("Target is unhealthy: node is missing" )
160-
161- // Always merge node missing message with any unhealthy machine conditions
162202 nodeMissingMessage := fmt .Sprintf ("Node %s has been deleted" , t .Machine .Status .NodeRef .Name )
163- allMessages := append ([]string {nodeMissingMessage }, unhealthyMachineMessages ... )
164-
165- reason := clusterv1 .MachineHealthCheckNodeDeletedReason
166- v1beta1Reason := clusterv1 .NodeNotFoundV1Beta1Reason
167- if len (unhealthyMachineMessages ) > 0 {
168- reason = clusterv1 .MachineHealthCheckUnhealthyMachineReason
169- v1beta1Reason = clusterv1 .UnhealthyMachineConditionV1Beta1Reason
170- }
171-
172- // For v1beta2 we use a single-line message prefixed with "Health check failed: "
173- conditionMessage := fmt .Sprintf ("Health check failed: %s" , strings .Join (allMessages , "; " ))
174- conditions .Set (t .Machine , metav1.Condition {
175- Type : clusterv1 .MachineHealthCheckSucceededCondition ,
176- Status : metav1 .ConditionFalse ,
177- Reason : reason ,
178- Message : conditionMessage ,
179- })
180-
181- // For v1beta1 keep the existing format
182- if len (unhealthyMachineMessages ) > 0 {
183- v1beta1conditions .MarkFalse (t .Machine , clusterv1 .MachineHealthCheckSucceededV1Beta1Condition , v1beta1Reason , clusterv1 .ConditionSeverityWarning , "%s" , strings .Join (allMessages , "; " ))
184- } else {
185- v1beta1conditions .MarkFalse (t .Machine , clusterv1 .MachineHealthCheckSucceededV1Beta1Condition , v1beta1Reason , clusterv1 .ConditionSeverityWarning , "" )
186- }
187- return true , time .Duration (0 )
203+ return clusterv1 .MachineHealthCheckNodeDeletedReason , clusterv1 .NodeNotFoundV1Beta1Reason , []string {nodeMissingMessage }, time .Duration (0 )
188204 }
189205
190206 // the node has not been set yet
191207 if t .Node == nil {
192- // Check if we already have unhealthy machine conditions that should trigger remediation
193- if len (unhealthyMachineMessages ) > 0 {
194- reason := clusterv1 .MachineHealthCheckUnhealthyMachineReason
195- v1beta1Reason := clusterv1 .UnhealthyMachineConditionV1Beta1Reason
196-
197- // For v1beta2 we use a single-line message prefixed with "Health check failed: "
198- conditionMessage := fmt .Sprintf ("Health check failed: %s" , strings .Join (unhealthyMachineMessages , "; " ))
199- conditions .Set (t .Machine , metav1.Condition {
200- Type : clusterv1 .MachineHealthCheckSucceededCondition ,
201- Status : metav1 .ConditionFalse ,
202- Reason : reason ,
203- Message : conditionMessage ,
204- })
205-
206- // For v1beta1 keep the existing semicolon-separated format (no prefix).
207- v1beta1conditions .MarkFalse (t .Machine , clusterv1 .MachineHealthCheckSucceededV1Beta1Condition , v1beta1Reason , clusterv1 .ConditionSeverityWarning , "%s" , strings .Join (unhealthyMachineMessages , "; " ))
208-
209- return true , time .Duration (0 )
210- }
211-
212208 if timeoutForMachineToHaveNode == disabledNodeStartupTimeout {
213209 // Startup timeout is disabled so no need to go any further.
214210 // No node yet to check conditions, can return early here.
215- return false , minDuration ( nextCheckTimes )
211+ return "" , "" , nil , time . Duration ( 0 )
216212 }
217213
218214 controlPlaneInitialized := conditions .GetLastTransitionTime (t .Cluster , clusterv1 .ClusterControlPlaneInitializedCondition )
@@ -241,47 +237,18 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
241237
242238 timeoutDuration := timeoutForMachineToHaveNode .Duration
243239 if comparisonTime .Add (timeoutForMachineToHaveNode .Duration ).Before (now ) {
244- // Node startup timeout - merge with any unhealthy machine conditions
245- nodeTimeoutMessage := fmt .Sprintf ("Node failed to report startup in %s" , timeoutDuration )
246- allMessages := append ([]string {nodeTimeoutMessage }, unhealthyMachineMessages ... )
247-
248- reason := clusterv1 .MachineHealthCheckNodeStartupTimeoutReason
249- v1beta1Reason := clusterv1 .NodeStartupTimeoutV1Beta1Reason
250- if len (unhealthyMachineMessages ) > 0 {
251- reason = clusterv1 .MachineHealthCheckUnhealthyMachineReason
252- v1beta1Reason = clusterv1 .UnhealthyMachineConditionV1Beta1Reason
253- }
254-
255240 logger .V (3 ).Info ("Target is unhealthy: machine has no node" , "duration" , timeoutDuration )
256-
257- // For v1beta2 we use a single-line message prefixed with "Health check failed: "
258- conditionMessage := fmt .Sprintf ("Health check failed: %s" , strings .Join (allMessages , "; " ))
259- conditions .Set (t .Machine , metav1.Condition {
260- Type : clusterv1 .MachineHealthCheckSucceededCondition ,
261- Status : metav1 .ConditionFalse ,
262- Reason : reason ,
263- Message : conditionMessage ,
264- })
265-
266- // For v1beta1 keep the existing format
267- if len (unhealthyMachineMessages ) > 0 {
268- v1beta1conditions .MarkFalse (t .Machine , clusterv1 .MachineHealthCheckSucceededV1Beta1Condition , v1beta1Reason , clusterv1 .ConditionSeverityWarning , "%s" , strings .Join (allMessages , "; " ))
269- } else {
270- v1beta1conditions .MarkFalse (t .Machine , clusterv1 .MachineHealthCheckSucceededV1Beta1Condition , v1beta1Reason , clusterv1 .ConditionSeverityWarning , "Node failed to report startup in %s" , timeoutDuration )
271- }
272- return true , time .Duration (0 )
241+ nodeTimeoutMessage := fmt .Sprintf ("Node failed to report startup in %s" , timeoutDuration )
242+ return clusterv1 .MachineHealthCheckNodeStartupTimeoutReason , clusterv1 .NodeStartupTimeoutV1Beta1Reason , []string {nodeTimeoutMessage }, time .Duration (0 )
273243 }
274244
275245 durationUnhealthy := now .Sub (comparisonTime )
276246 nextCheck := timeoutDuration - durationUnhealthy + time .Second
277- if nextCheck > 0 {
278- nextCheckTimes = append (nextCheckTimes , nextCheck )
279- }
280-
281- return false , minDuration (nextCheckTimes )
247+ return "" , "" , nil , nextCheck
282248 }
283249
284250 // check node conditions (only when node is available)
251+ var unhealthyNodeMessages []string
285252 for _ , c := range t .MHC .Spec .Checks .UnhealthyNodeConditions {
286253 nodeCondition := getNodeCondition (t .Node , c .Type )
287254
@@ -308,33 +275,10 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
308275 }
309276 }
310277
311- if len (unhealthyNodeMessages ) > 0 || len (unhealthyMachineMessages ) > 0 {
312- reason := clusterv1 .MachineHealthCheckUnhealthyNodeReason
313- v1beta1Reason := clusterv1 .UnhealthyNodeConditionV1Beta1Reason
314- if len (unhealthyMachineMessages ) > 0 {
315- reason = clusterv1 .MachineHealthCheckUnhealthyMachineReason
316- v1beta1Reason = clusterv1 .UnhealthyMachineConditionV1Beta1Reason
317- }
318-
319- // Combine all messages into a single comprehensive message
320- allMessages := append (unhealthyNodeMessages , unhealthyMachineMessages ... )
321- // For v1beta2 we use a single-line message prefixed with "Health check failed: "
322- // for compatibility with existing tests and other condition messages.
323- conditionMessage := fmt .Sprintf ("Health check failed: %s" , strings .Join (allMessages , "; " ))
324- conditions .Set (t .Machine , metav1.Condition {
325- Type : clusterv1 .MachineHealthCheckSucceededCondition ,
326- Status : metav1 .ConditionFalse ,
327- Reason : reason ,
328- Message : conditionMessage ,
329- })
330-
331- // For v1beta1 keep the existing semicolon-separated format (no prefix).
332- v1beta1conditions .MarkFalse (t .Machine , clusterv1 .MachineHealthCheckSucceededV1Beta1Condition , v1beta1Reason , clusterv1 .ConditionSeverityWarning , "%s" , strings .Join (allMessages , "; " ))
333-
334- return true , time .Duration (0 )
278+ if len (unhealthyNodeMessages ) > 0 {
279+ return clusterv1 .MachineHealthCheckUnhealthyNodeReason , clusterv1 .UnhealthyNodeConditionV1Beta1Reason , unhealthyNodeMessages , time .Duration (0 )
335280 }
336-
337- return false , minDuration (nextCheckTimes )
281+ return "" , "" , nil , minDuration (nextCheckTimes )
338282}
339283
340284// getTargetsFromMHC uses the MachineHealthCheck's selector to fetch machines
0 commit comments