Skip to content

Commit f96b742

Browse files
Address review comments: rework node and machine checks in needsRemediation() method
If both a node condition and machine condition are unhealthy, pick one reason but combine all the messages Signed-off-by: Furkat Gofurov <[email protected]>
1 parent 5ee7d25 commit f96b742

File tree

3 files changed

+69
-25
lines changed

3 files changed

+69
-25
lines changed

internal/controllers/machinehealthcheck/machinehealthcheck_targets.go

Lines changed: 66 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package machinehealthcheck
1919
import (
2020
"context"
2121
"fmt"
22+
"strings"
2223
"time"
2324

2425
"github.com/go-logr/logr"
@@ -181,6 +182,13 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
181182
return false, nextCheck
182183
}
183184

185+
// Collect all unhealthy conditions (both node and machine) to provide comprehensive status
186+
var (
187+
unhealthyMessages []string
188+
unhealthyReasons []string
189+
foundUnhealthyCondition bool
190+
)
191+
184192
// check node conditions
185193
for _, c := range t.MHC.Spec.Checks.UnhealthyNodeConditions {
186194
nodeCondition := getNodeCondition(t.Node, c.Type)
@@ -192,20 +200,15 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
192200
}
193201

194202
// If the node condition has been in the unhealthy state for longer than the
195-
// timeout, return true with no requeue time.
203+
// timeout, mark as unhealthy and collect the message.
196204
timeoutSecondsDuration := time.Duration(ptr.Deref(c.TimeoutSeconds, 0)) * time.Second
197205

198206
if nodeCondition.LastTransitionTime.Add(timeoutSecondsDuration).Before(now) {
199-
v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, clusterv1.UnhealthyNodeConditionV1Beta1Reason, clusterv1.ConditionSeverityWarning, "Condition %s on Node is reporting status %s for more than %s", c.Type, c.Status, timeoutSecondsDuration.String())
200-
logger.V(3).Info("Target is unhealthy: condition is in state longer than allowed timeout", "condition", c.Type, "state", c.Status, "timeout", timeoutSecondsDuration.String())
201-
202-
conditions.Set(t.Machine, metav1.Condition{
203-
Type: clusterv1.MachineHealthCheckSucceededCondition,
204-
Status: metav1.ConditionFalse,
205-
Reason: clusterv1.MachineHealthCheckUnhealthyNodeReason,
206-
Message: fmt.Sprintf("Health check failed: Condition %s on Node is reporting status %s for more than %s", c.Type, c.Status, timeoutSecondsDuration.String()),
207-
})
208-
return true, time.Duration(0)
207+
foundUnhealthyCondition = true
208+
unhealthyMessages = append(unhealthyMessages, fmt.Sprintf("Node condition %s is %s for more than %s", c.Type, c.Status, timeoutSecondsDuration.String()))
209+
unhealthyReasons = append(unhealthyReasons, "UnhealthyNode")
210+
logger.V(3).Info("Target is unhealthy: node condition is in state longer than allowed timeout", "condition", c.Type, "state", c.Status, "timeout", timeoutSecondsDuration.String())
211+
continue
209212
}
210213

211214
durationUnhealthy := now.Sub(nodeCondition.LastTransitionTime.Time)
@@ -226,20 +229,15 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
226229
}
227230

228231
// If the machine condition has been in the unhealthy state for longer than the
229-
// timeout, return true with no requeue time.
232+
// timeout, mark as unhealthy and collect the message.
230233
timeoutSecondsDuration := time.Duration(ptr.Deref(c.TimeoutSeconds, 0)) * time.Second
231234

232235
if machineCondition.LastTransitionTime.Add(timeoutSecondsDuration).Before(now) {
233-
v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, clusterv1.UnhealthyMachineConditionV1Beta1Reason, clusterv1.ConditionSeverityWarning, "Condition %s on Machine is reporting status %s for more than %s", c.Type, c.Status, timeoutSecondsDuration.String())
234-
logger.V(3).Info("Target is unhealthy: condition is in state longer than allowed timeout", "condition", c.Type, "state", c.Status, "timeout", timeoutSecondsDuration.String())
235-
236-
conditions.Set(t.Machine, metav1.Condition{
237-
Type: clusterv1.MachineHealthCheckSucceededCondition,
238-
Status: metav1.ConditionFalse,
239-
Reason: clusterv1.MachineHealthCheckUnhealthyMachineReason,
240-
Message: fmt.Sprintf("Health check failed: Condition %s on Machine is reporting status %s for more than %s", c.Type, c.Status, timeoutSecondsDuration.String()),
241-
})
242-
return true, time.Duration(0)
236+
foundUnhealthyCondition = true
237+
unhealthyMessages = append(unhealthyMessages, fmt.Sprintf("Machine condition %s is %s for more than %s", c.Type, c.Status, timeoutSecondsDuration.String()))
238+
unhealthyReasons = append(unhealthyReasons, "UnhealthyMachine")
239+
logger.V(3).Info("Target is unhealthy: machine condition is in state longer than allowed timeout", "condition", c.Type, "state", c.Status, "timeout", timeoutSecondsDuration.String())
240+
continue
243241
}
244242

245243
durationUnhealthy := now.Sub(machineCondition.LastTransitionTime.Time)
@@ -249,6 +247,52 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi
249247
}
250248
}
251249

250+
// If any unhealthy conditions were found, set the combined status
251+
if foundUnhealthyCondition {
252+
// Determine the primary reason based on a consistent priority order:
253+
// 1. If both node and machine conditions are present, use a combined reason
254+
// 2. Otherwise use the specific reason for the type that failed
255+
var primaryReason, v1beta1Reason string
256+
if len(unhealthyReasons) > 0 {
257+
// Check if we have both node and machine reasons
258+
hasNodeReason := false
259+
hasMachineReason := false
260+
for _, reason := range unhealthyReasons {
261+
switch reason {
262+
case "UnhealthyNode":
263+
hasNodeReason = true
264+
case "UnhealthyMachine":
265+
hasMachineReason = true
266+
}
267+
}
268+
269+
if hasNodeReason && hasMachineReason {
270+
// Both types of conditions are unhealthy - use machine reason but indicate it's combined
271+
primaryReason = clusterv1.MachineHealthCheckUnhealthyMachineReason
272+
v1beta1Reason = clusterv1.UnhealthyMachineConditionV1Beta1Reason
273+
} else if hasMachineReason {
274+
primaryReason = clusterv1.MachineHealthCheckUnhealthyMachineReason
275+
v1beta1Reason = clusterv1.UnhealthyMachineConditionV1Beta1Reason
276+
} else if hasNodeReason {
277+
primaryReason = clusterv1.MachineHealthCheckUnhealthyNodeReason
278+
v1beta1Reason = clusterv1.UnhealthyNodeConditionV1Beta1Reason
279+
}
280+
}
281+
282+
// Combine all messages into a single comprehensive message
283+
combinedMessage := fmt.Sprintf("Health check failed: %s", strings.Join(unhealthyMessages, "; "))
284+
285+
v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, v1beta1Reason, clusterv1.ConditionSeverityWarning, "%s", combinedMessage)
286+
287+
conditions.Set(t.Machine, metav1.Condition{
288+
Type: clusterv1.MachineHealthCheckSucceededCondition,
289+
Status: metav1.ConditionFalse,
290+
Reason: primaryReason,
291+
Message: combinedMessage,
292+
})
293+
return true, time.Duration(0)
294+
}
295+
252296
return false, minDuration(nextCheckTimes)
253297
}
254298

internal/controllers/machinehealthcheck/machinehealthcheck_targets_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -390,7 +390,7 @@ func TestHealthCheckTargets(t *testing.T) {
390390
Node: testNodeUnknown400,
391391
nodeMissing: false,
392392
}
393-
nodeUnknown400Condition := newFailedHealthCheckV1Beta1Condition(clusterv1.UnhealthyNodeConditionV1Beta1Reason, "Condition Ready on node is reporting status Unknown for more than %s", (time.Duration(timeoutForUnhealthyNodeConditions) * time.Second).String())
393+
nodeUnknown400Condition := newFailedHealthCheckV1Beta1Condition(clusterv1.UnhealthyNodeConditionV1Beta1Reason, "Condition Ready on Node is reporting status Unknown for more than %s", (time.Duration(timeoutForUnhealthyNodeConditions) * time.Second).String())
394394
nodeUnknown400V1Beta2Condition := newFailedHealthCheckCondition(clusterv1.MachineHealthCheckUnhealthyNodeReason, "Health check failed: Condition Ready on Node is reporting status Unknown for more than %s", (time.Duration(timeoutForUnhealthyNodeConditions) * time.Second).String())
395395

396396
// Target for when a node is healthy
@@ -425,7 +425,7 @@ func TestHealthCheckTargets(t *testing.T) {
425425
}
426426
machineUnhealthy400Condition := newFailedHealthCheckV1Beta1Condition(
427427
clusterv1.UnhealthyMachineConditionV1Beta1Reason,
428-
"Condition EtcdPodHealthy on machine is reporting status False for more than %s",
428+
"Condition EtcdPodHealthy on Machine is reporting status False for more than %s",
429429
(time.Duration(timeoutForUnhealthyMachineConditions) * time.Second).String(),
430430
)
431431
machineUnhealthy400V1Beta2Condition := newFailedHealthCheckCondition(

internal/webhooks/machinehealthcheck_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -301,7 +301,7 @@ func TestMachineHealthCheckUnhealthyMachineConditions(t *testing.T) {
301301
expectErr: false,
302302
},
303303
{
304-
name: "do not fail if the UnhealthyMachineCondition array is nil",
304+
name: "do not fail if the UnhealthyMachineCondition array is empty",
305305
unhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{},
306306
expectErr: false,
307307
},

0 commit comments

Comments
 (0)