
Commit 0415ae3

Experiment
1 parent 2c1ca03 commit 0415ae3


internal/controllers/machinehealthcheck/machinehealthcheck_targets.go

Lines changed: 60 additions & 116 deletions
@@ -89,9 +89,6 @@ func (t *healthCheckTarget) nodeName() string {
 // Returns true if remediation is needed, and a duration indicating when to recheck if remediation
 // is not immediately required. The target should be requeued after this duration.
 func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachineToHaveNode metav1.Duration) (bool, time.Duration) {
-    var nextCheckTimes []time.Duration
-    now := time.Now()
-
     if annotations.HasRemediateMachine(t.Machine) {
         v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, clusterv1.HasRemediateMachineAnnotationV1Beta1Reason, clusterv1.ConditionSeverityWarning, "Marked for remediation via remediate-machine annotation")
         logger.V(3).Info("Target is marked for remediation via remediate-machine annotation")
@@ -120,12 +117,47 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
         return false, 0
     }
 
-    // Collect all unhealthy conditions (both node and machine) to provide comprehensive status
-    // Always evaluate machine conditions regardless of node state for consistent behavior
-    var (
-        unhealthyNodeMessages    []string
-        unhealthyMachineMessages []string
-    )
+    // Check machine conditions
+    unhealthyMachineMessages, nextMachineCheck := t.machineChecks(logger)
+
+    // Check node conditions
+    nodeConditionReason, nodeV1beta1ConditionReason, unhealthyNodeMessages, nextNodeCheck := t.nodeChecks(logger, timeoutForMachineToHaveNode)
+
+    // Combine results
+    if len(unhealthyMachineMessages) == 0 && len(unhealthyNodeMessages) == 0 {
+        return false, minDuration([]time.Duration{nextMachineCheck, nextNodeCheck})
+    }
+
+    reason := nodeConditionReason
+    v1beta1Reason := nodeV1beta1ConditionReason
+    if len(unhealthyMachineMessages) > 0 {
+        reason = clusterv1.MachineHealthCheckUnhealthyMachineReason
+        v1beta1Reason = clusterv1.UnhealthyMachineConditionV1Beta1Reason
+    }
+    // Combine machine and node messages into a single condition message.
+    allMessages := append(unhealthyMachineMessages, unhealthyNodeMessages...)
+
+    conditionMessage := fmt.Sprintf("Health check failed: %s", strings.Join(allMessages, "; "))
+    conditions.Set(t.Machine, metav1.Condition{
+        Type:    clusterv1.MachineHealthCheckSucceededCondition,
+        Status:  metav1.ConditionFalse,
+        Reason:  reason,
+        Message: conditionMessage,
+    })
+
+    if len(unhealthyMachineMessages) > 0 {
+        v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, v1beta1Reason, clusterv1.ConditionSeverityWarning, "%s", strings.Join(allMessages, "; "))
+    } else {
+        v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, v1beta1Reason, clusterv1.ConditionSeverityWarning, "")
+    }
+
+    return true, time.Duration(0)
+}
+
+func (t *healthCheckTarget) machineChecks(logger logr.Logger) ([]string, time.Duration) {
+    var unhealthyMachineMessages []string
+    var nextCheckTimes []time.Duration
+    now := time.Now()
 
     // Always check machine conditions first, regardless of node state
     for _, c := range t.MHC.Spec.Checks.UnhealthyMachineConditions {
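
The healthy path above returns the earlier of the two recheck hints via minDuration, a helper that already exists in this file but is not shown in the diff. A minimal sketch of its assumed behavior, for reference only (shortest duration wins, zero for an empty slice; the real helper may differ in details):

func minDuration(durations []time.Duration) time.Duration {
    if len(durations) == 0 {
        return time.Duration(0)
    }
    minDur := durations[0]
    for _, d := range durations {
        if d < minDur {
            minDur = d
        }
    }
    return minDur
}

Note that under this assumed semantics a zero value, which the new helpers use for "no recheck scheduled", wins over any positive interval, so the combined requeue hint is only non-zero when both checks report one.
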
@@ -154,65 +186,29 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
         }
     }
 
+    if len(unhealthyMachineMessages) > 0 {
+        return unhealthyMachineMessages, time.Duration(0)
+    }
+    return nil, minDuration(nextCheckTimes)
+}
+
+func (t *healthCheckTarget) nodeChecks(logger logr.Logger, timeoutForMachineToHaveNode metav1.Duration) (string, string, []string, time.Duration) {
+    var nextCheckTimes []time.Duration
+    now := time.Now()
+
     // Machine has Status.NodeRef set, although we couldn't find the node in the workload cluster.
     if t.nodeMissing {
         logger.V(3).Info("Target is unhealthy: node is missing")
-
-        // Always merge node missing message with any unhealthy machine conditions
         nodeMissingMessage := fmt.Sprintf("Node %s has been deleted", t.Machine.Status.NodeRef.Name)
-        allMessages := append([]string{nodeMissingMessage}, unhealthyMachineMessages...)
-
-        reason := clusterv1.MachineHealthCheckNodeDeletedReason
-        v1beta1Reason := clusterv1.NodeNotFoundV1Beta1Reason
-        if len(unhealthyMachineMessages) > 0 {
-            reason = clusterv1.MachineHealthCheckUnhealthyMachineReason
-            v1beta1Reason = clusterv1.UnhealthyMachineConditionV1Beta1Reason
-        }
-
-        // For v1beta2 we use a single-line message prefixed with "Health check failed: "
-        conditionMessage := fmt.Sprintf("Health check failed: %s", strings.Join(allMessages, "; "))
-        conditions.Set(t.Machine, metav1.Condition{
-            Type:    clusterv1.MachineHealthCheckSucceededCondition,
-            Status:  metav1.ConditionFalse,
-            Reason:  reason,
-            Message: conditionMessage,
-        })
-
-        // For v1beta1 keep the existing format
-        if len(unhealthyMachineMessages) > 0 {
-            v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, v1beta1Reason, clusterv1.ConditionSeverityWarning, "%s", strings.Join(allMessages, "; "))
-        } else {
-            v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, v1beta1Reason, clusterv1.ConditionSeverityWarning, "")
-        }
-        return true, time.Duration(0)
+        return clusterv1.MachineHealthCheckNodeDeletedReason, clusterv1.NodeNotFoundV1Beta1Reason, []string{nodeMissingMessage}, time.Duration(0)
     }
 
     // the node has not been set yet
     if t.Node == nil {
-        // Check if we already have unhealthy machine conditions that should trigger remediation
-        if len(unhealthyMachineMessages) > 0 {
-            reason := clusterv1.MachineHealthCheckUnhealthyMachineReason
-            v1beta1Reason := clusterv1.UnhealthyMachineConditionV1Beta1Reason
-
-            // For v1beta2 we use a single-line message prefixed with "Health check failed: "
-            conditionMessage := fmt.Sprintf("Health check failed: %s", strings.Join(unhealthyMachineMessages, "; "))
-            conditions.Set(t.Machine, metav1.Condition{
-                Type:    clusterv1.MachineHealthCheckSucceededCondition,
-                Status:  metav1.ConditionFalse,
-                Reason:  reason,
-                Message: conditionMessage,
-            })
-
-            // For v1beta1 keep the existing semicolon-separated format (no prefix).
-            v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, v1beta1Reason, clusterv1.ConditionSeverityWarning, "%s", strings.Join(unhealthyMachineMessages, "; "))
-
-            return true, time.Duration(0)
-        }
-
         if timeoutForMachineToHaveNode == disabledNodeStartupTimeout {
             // Startup timeout is disabled so no need to go any further.
             // No node yet to check conditions, can return early here.
-            return false, minDuration(nextCheckTimes)
+            return "", "", nil, time.Duration(0)
         }
 
         controlPlaneInitialized := conditions.GetLastTransitionTime(t.Cluster, clusterv1.ClusterControlPlaneInitializedCondition)
@@ -241,47 +237,18 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
 
         timeoutDuration := timeoutForMachineToHaveNode.Duration
         if comparisonTime.Add(timeoutForMachineToHaveNode.Duration).Before(now) {
-            // Node startup timeout - merge with any unhealthy machine conditions
-            nodeTimeoutMessage := fmt.Sprintf("Node failed to report startup in %s", timeoutDuration)
-            allMessages := append([]string{nodeTimeoutMessage}, unhealthyMachineMessages...)
-
-            reason := clusterv1.MachineHealthCheckNodeStartupTimeoutReason
-            v1beta1Reason := clusterv1.NodeStartupTimeoutV1Beta1Reason
-            if len(unhealthyMachineMessages) > 0 {
-                reason = clusterv1.MachineHealthCheckUnhealthyMachineReason
-                v1beta1Reason = clusterv1.UnhealthyMachineConditionV1Beta1Reason
-            }
-
             logger.V(3).Info("Target is unhealthy: machine has no node", "duration", timeoutDuration)
-
-            // For v1beta2 we use a single-line message prefixed with "Health check failed: "
-            conditionMessage := fmt.Sprintf("Health check failed: %s", strings.Join(allMessages, "; "))
-            conditions.Set(t.Machine, metav1.Condition{
-                Type:    clusterv1.MachineHealthCheckSucceededCondition,
-                Status:  metav1.ConditionFalse,
-                Reason:  reason,
-                Message: conditionMessage,
-            })
-
-            // For v1beta1 keep the existing format
-            if len(unhealthyMachineMessages) > 0 {
-                v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, v1beta1Reason, clusterv1.ConditionSeverityWarning, "%s", strings.Join(allMessages, "; "))
-            } else {
-                v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, v1beta1Reason, clusterv1.ConditionSeverityWarning, "Node failed to report startup in %s", timeoutDuration)
-            }
-            return true, time.Duration(0)
+            nodeTimeoutMessage := fmt.Sprintf("Node failed to report startup in %s", timeoutDuration)
+            return clusterv1.MachineHealthCheckNodeStartupTimeoutReason, clusterv1.NodeStartupTimeoutV1Beta1Reason, []string{nodeTimeoutMessage}, time.Duration(0)
         }
 
         durationUnhealthy := now.Sub(comparisonTime)
         nextCheck := timeoutDuration - durationUnhealthy + time.Second
-        if nextCheck > 0 {
-            nextCheckTimes = append(nextCheckTimes, nextCheck)
-        }
-
-        return false, minDuration(nextCheckTimes)
+        return "", "", nil, nextCheck
     }
 
     // check node conditions (only when node is available)
+    var unhealthyNodeMessages []string
     for _, c := range t.MHC.Spec.Checks.UnhealthyNodeConditions {
         nodeCondition := getNodeCondition(t.Node, c.Type)
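
The recheck interval computed just above is the remaining startup budget plus a one-second buffer. Because this branch is only reached while the timeout has not yet expired, the result is always positive, which is presumably why the old "if nextCheck > 0" guard could be dropped. A short, self-contained illustration of the arithmetic with hypothetical values:

package main

import (
    "fmt"
    "time"
)

func main() {
    // Hypothetical values: a 10-minute node startup timeout and a
    // machine whose comparison time was 3 minutes ago.
    timeoutDuration := 10 * time.Minute
    durationUnhealthy := 3 * time.Minute

    // Same arithmetic as nodeChecks: wait out the remaining startup
    // budget, plus a one-second buffer.
    nextCheck := timeoutDuration - durationUnhealthy + time.Second
    fmt.Println(nextCheck) // 7m1s
}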

@@ -308,33 +275,10 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
         }
     }
 
-    if len(unhealthyNodeMessages) > 0 || len(unhealthyMachineMessages) > 0 {
-        reason := clusterv1.MachineHealthCheckUnhealthyNodeReason
-        v1beta1Reason := clusterv1.UnhealthyNodeConditionV1Beta1Reason
-        if len(unhealthyMachineMessages) > 0 {
-            reason = clusterv1.MachineHealthCheckUnhealthyMachineReason
-            v1beta1Reason = clusterv1.UnhealthyMachineConditionV1Beta1Reason
-        }
-
-        // Combine all messages into a single comprehensive message
-        allMessages := append(unhealthyNodeMessages, unhealthyMachineMessages...)
-        // For v1beta2 we use a single-line message prefixed with "Health check failed: "
-        // for compatibility with existing tests and other condition messages.
-        conditionMessage := fmt.Sprintf("Health check failed: %s", strings.Join(allMessages, "; "))
-        conditions.Set(t.Machine, metav1.Condition{
-            Type:    clusterv1.MachineHealthCheckSucceededCondition,
-            Status:  metav1.ConditionFalse,
-            Reason:  reason,
-            Message: conditionMessage,
-        })
-
-        // For v1beta1 keep the existing semicolon-separated format (no prefix).
-        v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, v1beta1Reason, clusterv1.ConditionSeverityWarning, "%s", strings.Join(allMessages, "; "))
-
-        return true, time.Duration(0)
+    if len(unhealthyNodeMessages) > 0 {
+        return clusterv1.MachineHealthCheckUnhealthyNodeReason, clusterv1.UnhealthyNodeConditionV1Beta1Reason, unhealthyNodeMessages, time.Duration(0)
     }
-
-    return false, minDuration(nextCheckTimes)
+    return "", "", nil, minDuration(nextCheckTimes)
 }
 
 // getTargetsFromMHC uses the MachineHealthCheck's selector to fetch machines
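
Stepping back, the shape of the refactoring is: machineChecks and nodeChecks each report failure messages plus a recheck hint, and needsRemediation alone combines them and writes the conditions, with machine problems taking precedence over node problems for the reason. The toy program below illustrates that combining logic in isolation; every name in it is hypothetical, and it sketches the pattern rather than the real Cluster API types:

package main

import (
    "fmt"
    "strings"
    "time"
)

// checkResult mirrors the shape of the new helpers' return values:
// failure messages plus a hint for when to recheck a healthy target.
type checkResult struct {
    reason    string
    messages  []string
    nextCheck time.Duration
}

// combine mimics needsRemediation: a healthy target gets the earliest
// recheck hint; an unhealthy one gets a merged message, with the
// machine reason overriding the node reason.
func combine(machine, node checkResult) (bool, string, time.Duration) {
    if len(machine.messages) == 0 && len(node.messages) == 0 {
        next := machine.nextCheck
        if node.nextCheck < next {
            next = node.nextCheck
        }
        return false, "", next
    }
    reason := node.reason
    if len(machine.messages) > 0 {
        reason = machine.reason
    }
    all := append(machine.messages, node.messages...)
    return true, reason + ": Health check failed: " + strings.Join(all, "; "), 0
}

func main() {
    machine := checkResult{reason: "UnhealthyMachine", messages: []string{"Machine condition Updating is True"}}
    node := checkResult{reason: "UnhealthyNode", messages: []string{"Node condition Ready is False"}}
    remediate, msg, _ := combine(machine, node)
    fmt.Println(remediate, msg)
}

Keeping the condition writes in one place, instead of repeating them in every unhealthy branch, is what lets the v1beta1 and v1beta2 message formats stay consistent across the node-missing, startup-timeout, and unhealthy-condition paths that the deleted code handled separately.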
