From e105d189f2e9d7f5d5cd0545a2d27d53348aeae8 Mon Sep 17 00:00:00 2001
From: Furkat Gofurov
Date: Tue, 7 Oct 2025 23:54:13 +0300
Subject: [PATCH 1/8] Add support for checking Machine conditions in
 MachineHealthCheck

MachineHealthCheck currently only supports checking Node conditions to
determine whether a machine is healthy. However, Machine conditions capture
state that does not exist on nodes; for example, control plane machine
conditions such as EtcdPodHealthy and SchedulerPodHealthy can indicate
whether a control plane machine has been created correctly.

Adding support for Machine conditions enables us to perform remediation
during control plane upgrades.

This PR introduces a new field in MachineHealthCheckChecks:

- `UnhealthyMachineConditions`

The new field mirrors the behavior of `UnhealthyNodeConditions`, but the
MachineHealthCheck controller checks the Machine's conditions instead.

This reimplements and extends earlier work originally proposed in PR #12275.

Co-authored-by: Justin Miron
Signed-off-by: Furkat Gofurov
---
 api/core/v1beta1/conversion_test.go           |  47 ++++++
 api/core/v1beta2/cluster_types.go             |  20 +++
 api/core/v1beta2/clusterclass_types.go        |  20 +++
 api/core/v1beta2/machine_types.go             |   4 +
 api/core/v1beta2/machinehealthcheck_types.go  |  35 ++++
 api/core/v1beta2/v1beta1_condition_consts.go  |   3 +
 api/core/v1beta2/zz_generated.deepcopy.go     |  55 +++++++
 api/core/v1beta2/zz_generated.openapi.go      | 141 +++++++++++++++-
 .../cluster.x-k8s.io_clusterclasses.yaml      |  84 ++++++++++
 .../crd/bases/cluster.x-k8s.io_clusters.yaml  |  84 ++++++++++
 .../cluster.x-k8s.io_machinehealthchecks.yaml |  42 +++++
 .../healthchecking.md                         |   7 +
 .../cluster-class/write-clusterclass.md       |  14 ++
 docs/book/src/tasks/using-kustomize.md        |   7 +
 .../desiredstate/desired_state_test.go        |  57 ++++++-
 exp/topology/scope/blueprint.go               |  20 ++-
 exp/topology/scope/blueprint_test.go          | 155 ++++++++++++++++++
 internal/api/core/v1alpha3/conversion_test.go |  10 ++
 internal/api/core/v1alpha4/conversion_test.go |  10 ++
 .../machinehealthcheck_controller_test.go     | 120 +++++++++++++-
 .../machinehealthcheck_targets.go             |  49 +++++-
 .../machinehealthcheck_targets_test.go        | 114 ++++++++++++-
 .../topology/cluster/current_state_test.go    |  25 +++
 .../topology/cluster/reconcile_state_test.go  |  35 ++++
 internal/webhooks/cluster_test.go             |  43 ++++-
 internal/webhooks/clusterclass_test.go        |  79 ++++++++-
 internal/webhooks/machinehealthcheck_test.go  | 114 +++++++++++++
 .../cluster-template-kcp-remediation/mhc.yaml |   5 +
 .../cluster-template-md-remediation/mhc.yaml  |   6 +-
 .../main/clusterclass-quick-start.yaml        |   6 +-
 .../v1.11/clusterclass-quick-start.yaml       |   6 +-
 test/e2e/kcp_remediations.go                  |   4 +
 test/framework/machine_helpers.go             |  43 +++++
 test/framework/machinehealthcheck_helpers.go  |  65 ++++++++
 .../templates/clusterclass-in-memory.yaml     |  14 ++
 .../templates/clusterclass-quick-start.yaml   |  14 ++
 util/test/builder/builders.go                 |  24 ++-
 util/test/builder/zz_generated.deepcopy.go    |   7 +
 38 files changed, 1549 insertions(+), 39 deletions(-)

diff --git a/api/core/v1beta1/conversion_test.go b/api/core/v1beta1/conversion_test.go
index 10d557048460..a0dbc5f2fdba 100644
--- a/api/core/v1beta1/conversion_test.go
+++ b/api/core/v1beta1/conversion_test.go
@@ -99,6 +99,7 @@ func ClusterFuzzFuncs(_ runtimeserializer.CodecFactory) []interface{} {
 		hubClusterVariable,
 		hubFailureDomain,
 		hubUnhealthyNodeCondition,
+		hubUnhealthyMachineCondition,
 		spokeCluster,
 		spokeClusterTopology,
 		spokeObjectReference,
@@ -126,6 +127,17 @@ func hubClusterSpec(in *clusterv1.ClusterSpec,
c randfill.Continue) { in.ControlPlaneRef.APIGroup = gvk.Group in.ControlPlaneRef.Kind = gvk.Kind } + + // remove MachineHealthCheck.UnhealthyMachineConditions as it does not exist in v1beta1. + if in.Topology.IsDefined() && in.Topology.ControlPlane.HealthCheck.IsDefined() { + in.Topology.ControlPlane.HealthCheck.Checks.UnhealthyMachineConditions = nil + } + + if in.Topology.IsDefined() && len(in.Topology.Workers.MachineDeployments) > 0 { + for i := range in.Topology.Workers.MachineDeployments { + in.Topology.Workers.MachineDeployments[i].HealthCheck.Checks.UnhealthyMachineConditions = nil + } + } } func hubClusterStatus(in *clusterv1.ClusterStatus, c randfill.Continue) { @@ -177,6 +189,14 @@ func hubUnhealthyNodeCondition(in *clusterv1.UnhealthyNodeCondition, c randfill. } } +func hubUnhealthyMachineCondition(in *clusterv1.UnhealthyMachineCondition, c randfill.Continue) { + c.FillNoCustom(in) + + if in.TimeoutSeconds == nil { + in.TimeoutSeconds = ptr.To(int32(0)) // TimeoutSeconds is a required field and nil does not round trip + } +} + func spokeCluster(in *Cluster, c randfill.Continue) { c.FillNoCustom(in) @@ -267,12 +287,14 @@ func spokeClusterVariable(in *ClusterVariable, c randfill.Continue) { func ClusterClassFuncs(_ runtimeserializer.CodecFactory) []interface{} { return []interface{}{ + hubClusterClassSpec, hubClusterClassVariable, hubClusterClassStatusVariableDefinition, hubClusterClassStatus, hubJSONPatch, hubJSONSchemaProps, hubUnhealthyNodeCondition, + hubUnhealthyMachineCondition, spokeClusterClass, spokeObjectReference, spokeClusterClassStatus, @@ -287,6 +309,21 @@ func ClusterClassFuncs(_ runtimeserializer.CodecFactory) []interface{} { } } +func hubClusterClassSpec(in *clusterv1.ClusterClassSpec, c randfill.Continue) { + c.FillNoCustom(in) + + // remove MachineHealthCheck.UnhealthyMachineConditions as it does not exist in v1beta1. + if in.ControlPlane.HealthCheck.IsDefined() && in.ControlPlane.HealthCheck.Checks.UnhealthyMachineConditions != nil { + in.ControlPlane.HealthCheck.Checks.UnhealthyMachineConditions = nil + } + + if len(in.Workers.MachineDeployments) > 0 { + for i := range in.Workers.MachineDeployments { + in.Workers.MachineDeployments[i].HealthCheck.Checks.UnhealthyMachineConditions = nil + } + } +} + func hubClusterClassVariable(in *clusterv1.ClusterClassVariable, c randfill.Continue) { c.FillNoCustom(in) @@ -728,7 +765,9 @@ func spokeMachineDeploymentStatus(in *MachineDeploymentStatus, c randfill.Contin func MachineHealthCheckFuzzFuncs(_ runtimeserializer.CodecFactory) []interface{} { return []interface{}{ hubUnhealthyNodeCondition, + hubUnhealthyMachineCondition, hubMachineHealthCheckStatus, + hubMachineHealthCheckSpec, spokeMachineHealthCheck, spokeMachineHealthCheckSpec, spokeObjectReference, @@ -737,6 +776,14 @@ func MachineHealthCheckFuzzFuncs(_ runtimeserializer.CodecFactory) []interface{} } } +func hubMachineHealthCheckSpec(in *clusterv1.MachineHealthCheckSpec, c randfill.Continue) { + c.FillNoCustom(in) + + if in.Checks.UnhealthyMachineConditions != nil { + in.Checks.UnhealthyMachineConditions = nil + } +} + func hubMachineHealthCheckStatus(in *clusterv1.MachineHealthCheckStatus, c randfill.Continue) { c.FillNoCustom(in) // Drop empty structs with only omit empty fields. 
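For orientation before the API changes below, a minimal MachineHealthCheck using the new field might look like the following sketch (built against the v1beta2 types this patch introduces; the condition type, timeout value, and selector label are illustrative choices, not mandated by the change):

package example

import (
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/utils/ptr"

	clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2"
)

// controlPlaneMHC flags a control plane Machine as unhealthy once its
// EtcdPodHealthy condition has reported False for at least five minutes.
func controlPlaneMHC(clusterName, namespace string) *clusterv1.MachineHealthCheck {
	return &clusterv1.MachineHealthCheck{
		ObjectMeta: metav1.ObjectMeta{
			Name:      clusterName + "-cp-unhealthy", // hypothetical name
			Namespace: namespace,
		},
		Spec: clusterv1.MachineHealthCheckSpec{
			ClusterName: clusterName,
			Selector: metav1.LabelSelector{
				// Illustrative selector; match whichever Machines the check should cover.
				MatchLabels: map[string]string{"cluster.x-k8s.io/control-plane": ""},
			},
			Checks: clusterv1.MachineHealthCheckChecks{
				UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{
					{
						Type:           "EtcdPodHealthy", // Machine condition surfaced by KubeadmControlPlane
						Status:         metav1.ConditionFalse,
						TimeoutSeconds: ptr.To(int32(5 * 60)),
					},
				},
			},
		},
	}
}

As with `UnhealthyNodeConditions`, multiple entries are combined in a logical OR, so a single matching condition marks the Machine unhealthy.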
diff --git a/api/core/v1beta2/cluster_types.go b/api/core/v1beta2/cluster_types.go index af0969853155..48b697f22d43 100644 --- a/api/core/v1beta2/cluster_types.go +++ b/api/core/v1beta2/cluster_types.go @@ -725,6 +725,16 @@ type ControlPlaneTopologyHealthCheckChecks struct { // +kubebuilder:validation:MinItems=1 // +kubebuilder:validation:MaxItems=100 UnhealthyNodeConditions []UnhealthyNodeCondition `json:"unhealthyNodeConditions,omitempty"` + + // unhealthyMachineConditions contains a list of the machine conditions that determine + // whether a machine is considered unhealthy. The conditions are combined in a + // logical OR, i.e. if any of the conditions is met, the machine is unhealthy. + // + // +optional + // +listType=atomic + // +kubebuilder:validation:MinItems=1 + // +kubebuilder:validation:MaxItems=100 + UnhealthyMachineConditions []UnhealthyMachineCondition `json:"unhealthyMachineConditions,omitempty"` } // ControlPlaneTopologyHealthCheckRemediation configures if and how remediations are triggered if a control plane Machine is unhealthy. @@ -975,6 +985,16 @@ type MachineDeploymentTopologyHealthCheckChecks struct { // +kubebuilder:validation:MinItems=1 // +kubebuilder:validation:MaxItems=100 UnhealthyNodeConditions []UnhealthyNodeCondition `json:"unhealthyNodeConditions,omitempty"` + + // unhealthyMachineConditions contains a list of the machine conditions that determine + // whether a machine is considered unhealthy. The conditions are combined in a + // logical OR, i.e. if any of the conditions is met, the machine is unhealthy. + // + // +optional + // +listType=atomic + // +kubebuilder:validation:MinItems=1 + // +kubebuilder:validation:MaxItems=100 + UnhealthyMachineConditions []UnhealthyMachineCondition `json:"unhealthyMachineConditions,omitempty"` } // MachineDeploymentTopologyHealthCheckRemediation configures if and how remediations are triggered if a MachineDeployment Machine is unhealthy. diff --git a/api/core/v1beta2/clusterclass_types.go b/api/core/v1beta2/clusterclass_types.go index af13f9d030cc..12e8cc19c5d4 100644 --- a/api/core/v1beta2/clusterclass_types.go +++ b/api/core/v1beta2/clusterclass_types.go @@ -281,6 +281,16 @@ type ControlPlaneClassHealthCheckChecks struct { // +kubebuilder:validation:MinItems=1 // +kubebuilder:validation:MaxItems=100 UnhealthyNodeConditions []UnhealthyNodeCondition `json:"unhealthyNodeConditions,omitempty"` + + // unhealthyMachineConditions contains a list of the machine conditions that determine + // whether a machine is considered unhealthy. The conditions are combined in a + // logical OR, i.e. if any of the conditions is met, the machine is unhealthy. + // + // +optional + // +listType=atomic + // +kubebuilder:validation:MinItems=1 + // +kubebuilder:validation:MaxItems=100 + UnhealthyMachineConditions []UnhealthyMachineCondition `json:"unhealthyMachineConditions,omitempty"` } // ControlPlaneClassHealthCheckRemediation configures if and how remediations are triggered if a control plane Machine is unhealthy. @@ -542,6 +552,16 @@ type MachineDeploymentClassHealthCheckChecks struct { // +kubebuilder:validation:MinItems=1 // +kubebuilder:validation:MaxItems=100 UnhealthyNodeConditions []UnhealthyNodeCondition `json:"unhealthyNodeConditions,omitempty"` + + // unhealthyMachineConditions contains a list of the machine conditions that determine + // whether a machine is considered unhealthy. The conditions are combined in a + // logical OR, i.e. if any of the conditions is met, the machine is unhealthy. 
+	//
+	// +optional
+	// +listType=atomic
+	// +kubebuilder:validation:MinItems=1
+	// +kubebuilder:validation:MaxItems=100
+	UnhealthyMachineConditions []UnhealthyMachineCondition `json:"unhealthyMachineConditions,omitempty"`
 }
 
 // MachineDeploymentClassHealthCheckRemediation configures if and how remediations are triggered if a MachineDeployment Machine is unhealthy.
diff --git a/api/core/v1beta2/machine_types.go b/api/core/v1beta2/machine_types.go
index a60f736ba21c..4a7163752fb2 100644
--- a/api/core/v1beta2/machine_types.go
+++ b/api/core/v1beta2/machine_types.go
@@ -276,6 +276,10 @@ const (
 	// defined by a MachineHealthCheck object.
 	MachineHealthCheckUnhealthyNodeReason = "UnhealthyNode"
 
+	// MachineHealthCheckUnhealthyMachineReason surfaces when the machine does not pass the health checks
+	// defined by a MachineHealthCheck object.
+	MachineHealthCheckUnhealthyMachineReason = "UnhealthyMachine"
+
 	// MachineHealthCheckNodeStartupTimeoutReason surfaces when the node hosted on the machine does not appear within
 	// the timeout defined by a MachineHealthCheck object.
 	MachineHealthCheckNodeStartupTimeoutReason = "NodeStartupTimeout"
diff --git a/api/core/v1beta2/machinehealthcheck_types.go b/api/core/v1beta2/machinehealthcheck_types.go
index 9a7e31cae44d..f3e7aa1767f7 100644
--- a/api/core/v1beta2/machinehealthcheck_types.go
+++ b/api/core/v1beta2/machinehealthcheck_types.go
@@ -111,6 +111,16 @@ type MachineHealthCheckChecks struct {
 	// +kubebuilder:validation:MinItems=1
 	// +kubebuilder:validation:MaxItems=100
 	UnhealthyNodeConditions []UnhealthyNodeCondition `json:"unhealthyNodeConditions,omitempty"`
+
+	// unhealthyMachineConditions contains a list of the machine conditions that determine
+	// whether a machine is considered unhealthy. The conditions are combined in a
+	// logical OR, i.e. if any of the conditions is met, the machine is unhealthy.
+	//
+	// +optional
+	// +listType=atomic
+	// +kubebuilder:validation:MinItems=1
+	// +kubebuilder:validation:MaxItems=100
+	UnhealthyMachineConditions []UnhealthyMachineCondition `json:"unhealthyMachineConditions,omitempty"`
 }
 
 // MachineHealthCheckRemediation configures if and how remediations are triggered if a Machine is unhealthy.
@@ -234,6 +244,31 @@ type UnhealthyNodeCondition struct {
 	TimeoutSeconds *int32 `json:"timeoutSeconds,omitempty"`
 }
 
+// UnhealthyMachineCondition represents a Machine condition type and value with a timeout
+// specified in seconds. When the named condition has been in the given
+// status for at least the timeout value, a machine is considered unhealthy.
+type UnhealthyMachineCondition struct {
+	// type of Machine condition
+	// +kubebuilder:validation:Type=string
+	// +kubebuilder:validation:MinLength=1
+	// +kubebuilder:validation:MaxLength=316
+	// +required
+	Type string `json:"type,omitempty"`
+
+	// status of the condition, one of True, False, Unknown.
+	// +required
+	// +kubebuilder:validation:Enum=True;False;Unknown
+	Status metav1.ConditionStatus `json:"status,omitempty"`
+
+	// timeoutSeconds is the duration in seconds that a machine must be in a given status for,
+	// after which the machine is considered unhealthy.
+	// For example, with a value of 3600, the machine must match the status
+	// for at least 1 hour before being considered unhealthy.
+	// +required
+	// +kubebuilder:validation:Minimum=0
+	TimeoutSeconds *int32 `json:"timeoutSeconds,omitempty"`
+}
+
 // MachineHealthCheckStatus defines the observed state of MachineHealthCheck.
// +kubebuilder:validation:MinProperties=1 type MachineHealthCheckStatus struct { diff --git a/api/core/v1beta2/v1beta1_condition_consts.go b/api/core/v1beta2/v1beta1_condition_consts.go index aef565c0aa2f..7de433cc25db 100644 --- a/api/core/v1beta2/v1beta1_condition_consts.go +++ b/api/core/v1beta2/v1beta1_condition_consts.go @@ -157,6 +157,9 @@ const ( // UnhealthyNodeConditionV1Beta1Reason is the reason used when a machine's node has one of the MachineHealthCheck's unhealthy conditions. UnhealthyNodeConditionV1Beta1Reason = "UnhealthyNode" + + // UnhealthyMachineConditionV1Beta1Reason is the reason used when a machine has one of the MachineHealthCheck's unhealthy conditions. + UnhealthyMachineConditionV1Beta1Reason = "UnhealthyMachine" ) const ( diff --git a/api/core/v1beta2/zz_generated.deepcopy.go b/api/core/v1beta2/zz_generated.deepcopy.go index 0c5ee6208d3c..b8707d841111 100644 --- a/api/core/v1beta2/zz_generated.deepcopy.go +++ b/api/core/v1beta2/zz_generated.deepcopy.go @@ -840,6 +840,13 @@ func (in *ControlPlaneClassHealthCheckChecks) DeepCopyInto(out *ControlPlaneClas (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.UnhealthyMachineConditions != nil { + in, out := &in.UnhealthyMachineConditions, &out.UnhealthyMachineConditions + *out = make([]UnhealthyMachineCondition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ControlPlaneClassHealthCheckChecks. @@ -1016,6 +1023,13 @@ func (in *ControlPlaneTopologyHealthCheckChecks) DeepCopyInto(out *ControlPlaneT (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.UnhealthyMachineConditions != nil { + in, out := &in.UnhealthyMachineConditions, &out.UnhealthyMachineConditions + *out = make([]UnhealthyMachineCondition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ControlPlaneTopologyHealthCheckChecks. @@ -1604,6 +1618,13 @@ func (in *MachineDeploymentClassHealthCheckChecks) DeepCopyInto(out *MachineDepl (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.UnhealthyMachineConditions != nil { + in, out := &in.UnhealthyMachineConditions, &out.UnhealthyMachineConditions + *out = make([]UnhealthyMachineCondition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MachineDeploymentClassHealthCheckChecks. @@ -2071,6 +2092,13 @@ func (in *MachineDeploymentTopologyHealthCheckChecks) DeepCopyInto(out *MachineD (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.UnhealthyMachineConditions != nil { + in, out := &in.UnhealthyMachineConditions, &out.UnhealthyMachineConditions + *out = make([]UnhealthyMachineCondition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MachineDeploymentTopologyHealthCheckChecks. @@ -2476,6 +2504,13 @@ func (in *MachineHealthCheckChecks) DeepCopyInto(out *MachineHealthCheckChecks) (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.UnhealthyMachineConditions != nil { + in, out := &in.UnhealthyMachineConditions, &out.UnhealthyMachineConditions + *out = make([]UnhealthyMachineCondition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new MachineHealthCheckChecks. 
@@ -3701,6 +3736,26 @@ func (in *Topology) DeepCopy() *Topology { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *UnhealthyMachineCondition) DeepCopyInto(out *UnhealthyMachineCondition) { + *out = *in + if in.TimeoutSeconds != nil { + in, out := &in.TimeoutSeconds, &out.TimeoutSeconds + *out = new(int32) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new UnhealthyMachineCondition. +func (in *UnhealthyMachineCondition) DeepCopy() *UnhealthyMachineCondition { + if in == nil { + return nil + } + out := new(UnhealthyMachineCondition) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *UnhealthyNodeCondition) DeepCopyInto(out *UnhealthyNodeCondition) { *out = *in diff --git a/api/core/v1beta2/zz_generated.openapi.go b/api/core/v1beta2/zz_generated.openapi.go index 42744b6836e6..65aafa05e781 100644 --- a/api/core/v1beta2/zz_generated.openapi.go +++ b/api/core/v1beta2/zz_generated.openapi.go @@ -173,6 +173,7 @@ func GetOpenAPIDefinitions(ref common.ReferenceCallback) map[string]common.OpenA "sigs.k8s.io/cluster-api/api/core/v1beta2.PatchSelectorMatchMachineDeploymentClass": schema_cluster_api_api_core_v1beta2_PatchSelectorMatchMachineDeploymentClass(ref), "sigs.k8s.io/cluster-api/api/core/v1beta2.PatchSelectorMatchMachinePoolClass": schema_cluster_api_api_core_v1beta2_PatchSelectorMatchMachinePoolClass(ref), "sigs.k8s.io/cluster-api/api/core/v1beta2.Topology": schema_cluster_api_api_core_v1beta2_Topology(ref), + "sigs.k8s.io/cluster-api/api/core/v1beta2.UnhealthyMachineCondition": schema_cluster_api_api_core_v1beta2_UnhealthyMachineCondition(ref), "sigs.k8s.io/cluster-api/api/core/v1beta2.UnhealthyNodeCondition": schema_cluster_api_api_core_v1beta2_UnhealthyNodeCondition(ref), "sigs.k8s.io/cluster-api/api/core/v1beta2.ValidationRule": schema_cluster_api_api_core_v1beta2_ValidationRule(ref), "sigs.k8s.io/cluster-api/api/core/v1beta2.VariableSchema": schema_cluster_api_api_core_v1beta2_VariableSchema(ref), @@ -1687,11 +1688,30 @@ func schema_cluster_api_api_core_v1beta2_ControlPlaneClassHealthCheckChecks(ref }, }, }, + "unhealthyMachineConditions": { + VendorExtensible: spec.VendorExtensible{ + Extensions: spec.Extensions{ + "x-kubernetes-list-type": "atomic", + }, + }, + SchemaProps: spec.SchemaProps{ + Description: "unhealthyMachineConditions contains a list of the machine conditions that determine whether a machine is considered unhealthy. The conditions are combined in a logical OR, i.e. 
if any of the conditions is met, the machine is unhealthy.", + Type: []string{"array"}, + Items: &spec.SchemaOrArray{ + Schema: &spec.Schema{ + SchemaProps: spec.SchemaProps{ + Default: map[string]interface{}{}, + Ref: ref("sigs.k8s.io/cluster-api/api/core/v1beta2.UnhealthyMachineCondition"), + }, + }, + }, + }, + }, }, }, }, Dependencies: []string{ - "sigs.k8s.io/cluster-api/api/core/v1beta2.UnhealthyNodeCondition"}, + "sigs.k8s.io/cluster-api/api/core/v1beta2.UnhealthyMachineCondition", "sigs.k8s.io/cluster-api/api/core/v1beta2.UnhealthyNodeCondition"}, } } @@ -1970,11 +1990,30 @@ func schema_cluster_api_api_core_v1beta2_ControlPlaneTopologyHealthCheckChecks(r }, }, }, + "unhealthyMachineConditions": { + VendorExtensible: spec.VendorExtensible{ + Extensions: spec.Extensions{ + "x-kubernetes-list-type": "atomic", + }, + }, + SchemaProps: spec.SchemaProps{ + Description: "unhealthyMachineConditions contains a list of the machine conditions that determine whether a machine is considered unhealthy. The conditions are combined in a logical OR, i.e. if any of the conditions is met, the machine is unhealthy.", + Type: []string{"array"}, + Items: &spec.SchemaOrArray{ + Schema: &spec.Schema{ + SchemaProps: spec.SchemaProps{ + Default: map[string]interface{}{}, + Ref: ref("sigs.k8s.io/cluster-api/api/core/v1beta2.UnhealthyMachineCondition"), + }, + }, + }, + }, + }, }, }, }, Dependencies: []string{ - "sigs.k8s.io/cluster-api/api/core/v1beta2.UnhealthyNodeCondition"}, + "sigs.k8s.io/cluster-api/api/core/v1beta2.UnhealthyMachineCondition", "sigs.k8s.io/cluster-api/api/core/v1beta2.UnhealthyNodeCondition"}, } } @@ -2991,11 +3030,30 @@ func schema_cluster_api_api_core_v1beta2_MachineDeploymentClassHealthCheckChecks }, }, }, + "unhealthyMachineConditions": { + VendorExtensible: spec.VendorExtensible{ + Extensions: spec.Extensions{ + "x-kubernetes-list-type": "atomic", + }, + }, + SchemaProps: spec.SchemaProps{ + Description: "unhealthyMachineConditions contains a list of the machine conditions that determine whether a machine is considered unhealthy. The conditions are combined in a logical OR, i.e. if any of the conditions is met, the machine is unhealthy.", + Type: []string{"array"}, + Items: &spec.SchemaOrArray{ + Schema: &spec.Schema{ + SchemaProps: spec.SchemaProps{ + Default: map[string]interface{}{}, + Ref: ref("sigs.k8s.io/cluster-api/api/core/v1beta2.UnhealthyMachineCondition"), + }, + }, + }, + }, + }, }, }, }, Dependencies: []string{ - "sigs.k8s.io/cluster-api/api/core/v1beta2.UnhealthyNodeCondition"}, + "sigs.k8s.io/cluster-api/api/core/v1beta2.UnhealthyMachineCondition", "sigs.k8s.io/cluster-api/api/core/v1beta2.UnhealthyNodeCondition"}, } } @@ -3771,11 +3829,30 @@ func schema_cluster_api_api_core_v1beta2_MachineDeploymentTopologyHealthCheckChe }, }, }, + "unhealthyMachineConditions": { + VendorExtensible: spec.VendorExtensible{ + Extensions: spec.Extensions{ + "x-kubernetes-list-type": "atomic", + }, + }, + SchemaProps: spec.SchemaProps{ + Description: "unhealthyMachineConditions contains a list of the machine conditions that determine whether a machine is considered unhealthy. The conditions are combined in a logical OR, i.e. 
if any of the conditions is met, the machine is unhealthy.", + Type: []string{"array"}, + Items: &spec.SchemaOrArray{ + Schema: &spec.Schema{ + SchemaProps: spec.SchemaProps{ + Default: map[string]interface{}{}, + Ref: ref("sigs.k8s.io/cluster-api/api/core/v1beta2.UnhealthyMachineCondition"), + }, + }, + }, + }, + }, }, }, }, Dependencies: []string{ - "sigs.k8s.io/cluster-api/api/core/v1beta2.UnhealthyNodeCondition"}, + "sigs.k8s.io/cluster-api/api/core/v1beta2.UnhealthyMachineCondition", "sigs.k8s.io/cluster-api/api/core/v1beta2.UnhealthyNodeCondition"}, } } @@ -4403,11 +4480,30 @@ func schema_cluster_api_api_core_v1beta2_MachineHealthCheckChecks(ref common.Ref }, }, }, + "unhealthyMachineConditions": { + VendorExtensible: spec.VendorExtensible{ + Extensions: spec.Extensions{ + "x-kubernetes-list-type": "atomic", + }, + }, + SchemaProps: spec.SchemaProps{ + Description: "unhealthyMachineConditions contains a list of the machine conditions that determine whether a machine is considered unhealthy. The conditions are combined in a logical OR, i.e. if any of the conditions is met, the machine is unhealthy.", + Type: []string{"array"}, + Items: &spec.SchemaOrArray{ + Schema: &spec.Schema{ + SchemaProps: spec.SchemaProps{ + Default: map[string]interface{}{}, + Ref: ref("sigs.k8s.io/cluster-api/api/core/v1beta2.UnhealthyMachineCondition"), + }, + }, + }, + }, + }, }, }, }, Dependencies: []string{ - "sigs.k8s.io/cluster-api/api/core/v1beta2.UnhealthyNodeCondition"}, + "sigs.k8s.io/cluster-api/api/core/v1beta2.UnhealthyMachineCondition", "sigs.k8s.io/cluster-api/api/core/v1beta2.UnhealthyNodeCondition"}, } } @@ -6589,6 +6685,41 @@ func schema_cluster_api_api_core_v1beta2_Topology(ref common.ReferenceCallback) } } +func schema_cluster_api_api_core_v1beta2_UnhealthyMachineCondition(ref common.ReferenceCallback) common.OpenAPIDefinition { + return common.OpenAPIDefinition{ + Schema: spec.Schema{ + SchemaProps: spec.SchemaProps{ + Description: "UnhealthyMachineCondition represents a Machine condition type and value with a timeout specified as a duration. When the named condition has been in the given status for at least the timeout value, a machine is considered unhealthy.", + Type: []string{"object"}, + Properties: map[string]spec.Schema{ + "type": { + SchemaProps: spec.SchemaProps{ + Description: "type of Machine condition", + Type: []string{"string"}, + Format: "", + }, + }, + "status": { + SchemaProps: spec.SchemaProps{ + Description: "status of the condition, one of True, False, Unknown.", + Type: []string{"string"}, + Format: "", + }, + }, + "timeoutSeconds": { + SchemaProps: spec.SchemaProps{ + Description: "timeoutSeconds is the duration that a machine must be in a given status for, after which the machine is considered unhealthy. 
For example, with a value of \"1h\", the machine must match the status for at least 1 hour before being considered unhealthy.", + Type: []string{"integer"}, + Format: "int32", + }, + }, + }, + Required: []string{"type", "status", "timeoutSeconds"}, + }, + }, + } +} + func schema_cluster_api_api_core_v1beta2_UnhealthyNodeCondition(ref common.ReferenceCallback) common.OpenAPIDefinition { return common.OpenAPIDefinition{ Schema: spec.Schema{ diff --git a/config/crd/bases/cluster.x-k8s.io_clusterclasses.yaml b/config/crd/bases/cluster.x-k8s.io_clusterclasses.yaml index 19c37ad21aaa..c4b83e79fcaa 100644 --- a/config/crd/bases/cluster.x-k8s.io_clusterclasses.yaml +++ b/config/crd/bases/cluster.x-k8s.io_clusterclasses.yaml @@ -2969,6 +2969,48 @@ spec: format: int32 minimum: 0 type: integer + unhealthyMachineConditions: + description: |- + unhealthyMachineConditions contains a list of the machine conditions that determine + whether a machine is considered unhealthy. The conditions are combined in a + logical OR, i.e. if any of the conditions is met, the machine is unhealthy. + items: + description: |- + UnhealthyMachineCondition represents a Machine condition type and value with a timeout + specified as a duration. When the named condition has been in the given + status for at least the timeout value, a machine is considered unhealthy. + properties: + status: + description: status of the condition, one of True, + False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + timeoutSeconds: + description: |- + timeoutSeconds is the duration that a machine must be in a given status for, + after which the machine is considered unhealthy. + For example, with a value of "1h", the machine must match the status + for at least 1 hour before being considered unhealthy. + format: int32 + minimum: 0 + type: integer + type: + description: type of Machine condition + maxLength: 316 + minLength: 1 + type: string + required: + - status + - timeoutSeconds + - type + type: object + maxItems: 100 + minItems: 1 + type: array + x-kubernetes-list-type: atomic unhealthyNodeConditions: description: |- unhealthyNodeConditions contains a list of conditions that determine @@ -4146,6 +4188,48 @@ spec: format: int32 minimum: 0 type: integer + unhealthyMachineConditions: + description: |- + unhealthyMachineConditions contains a list of the machine conditions that determine + whether a machine is considered unhealthy. The conditions are combined in a + logical OR, i.e. if any of the conditions is met, the machine is unhealthy. + items: + description: |- + UnhealthyMachineCondition represents a Machine condition type and value with a timeout + specified as a duration. When the named condition has been in the given + status for at least the timeout value, a machine is considered unhealthy. + properties: + status: + description: status of the condition, one + of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + timeoutSeconds: + description: |- + timeoutSeconds is the duration that a machine must be in a given status for, + after which the machine is considered unhealthy. + For example, with a value of "1h", the machine must match the status + for at least 1 hour before being considered unhealthy. 
+ format: int32 + minimum: 0 + type: integer + type: + description: type of Machine condition + maxLength: 316 + minLength: 1 + type: string + required: + - status + - timeoutSeconds + - type + type: object + maxItems: 100 + minItems: 1 + type: array + x-kubernetes-list-type: atomic unhealthyNodeConditions: description: |- unhealthyNodeConditions contains a list of conditions that determine diff --git a/config/crd/bases/cluster.x-k8s.io_clusters.yaml b/config/crd/bases/cluster.x-k8s.io_clusters.yaml index e3b22be95125..124f314076c2 100644 --- a/config/crd/bases/cluster.x-k8s.io_clusters.yaml +++ b/config/crd/bases/cluster.x-k8s.io_clusters.yaml @@ -2531,6 +2531,48 @@ spec: format: int32 minimum: 0 type: integer + unhealthyMachineConditions: + description: |- + unhealthyMachineConditions contains a list of the machine conditions that determine + whether a machine is considered unhealthy. The conditions are combined in a + logical OR, i.e. if any of the conditions is met, the machine is unhealthy. + items: + description: |- + UnhealthyMachineCondition represents a Machine condition type and value with a timeout + specified as a duration. When the named condition has been in the given + status for at least the timeout value, a machine is considered unhealthy. + properties: + status: + description: status of the condition, one of + True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + timeoutSeconds: + description: |- + timeoutSeconds is the duration that a machine must be in a given status for, + after which the machine is considered unhealthy. + For example, with a value of "1h", the machine must match the status + for at least 1 hour before being considered unhealthy. + format: int32 + minimum: 0 + type: integer + type: + description: type of Machine condition + maxLength: 316 + minLength: 1 + type: string + required: + - status + - timeoutSeconds + - type + type: object + maxItems: 100 + minItems: 1 + type: array + x-kubernetes-list-type: atomic unhealthyNodeConditions: description: |- unhealthyNodeConditions contains a list of conditions that determine @@ -2932,6 +2974,48 @@ spec: format: int32 minimum: 0 type: integer + unhealthyMachineConditions: + description: |- + unhealthyMachineConditions contains a list of the machine conditions that determine + whether a machine is considered unhealthy. The conditions are combined in a + logical OR, i.e. if any of the conditions is met, the machine is unhealthy. + items: + description: |- + UnhealthyMachineCondition represents a Machine condition type and value with a timeout + specified as a duration. When the named condition has been in the given + status for at least the timeout value, a machine is considered unhealthy. + properties: + status: + description: status of the condition, + one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + timeoutSeconds: + description: |- + timeoutSeconds is the duration that a machine must be in a given status for, + after which the machine is considered unhealthy. + For example, with a value of "1h", the machine must match the status + for at least 1 hour before being considered unhealthy. 
+                            format: int32
+                            minimum: 0
+                            type: integer
+                          type:
+                            description: type of Machine condition
+                            maxLength: 316
+                            minLength: 1
+                            type: string
+                        required:
+                        - status
+                        - timeoutSeconds
+                        - type
+                        type: object
+                      maxItems: 100
+                      minItems: 1
+                      type: array
+                      x-kubernetes-list-type: atomic
                     unhealthyNodeConditions:
                       description: |-
                         unhealthyNodeConditions contains a list of conditions that determine
diff --git a/config/crd/bases/cluster.x-k8s.io_machinehealthchecks.yaml b/config/crd/bases/cluster.x-k8s.io_machinehealthchecks.yaml
index cd6cfc67d52c..559cab242b23 100644
--- a/config/crd/bases/cluster.x-k8s.io_machinehealthchecks.yaml
+++ b/config/crd/bases/cluster.x-k8s.io_machinehealthchecks.yaml
@@ -1065,6 +1065,48 @@ spec:
                     format: int32
                     minimum: 0
                     type: integer
+                  unhealthyMachineConditions:
+                    description: |-
+                      unhealthyMachineConditions contains a list of the machine conditions that determine
+                      whether a machine is considered unhealthy. The conditions are combined in a
+                      logical OR, i.e. if any of the conditions is met, the machine is unhealthy.
+                    items:
+                      description: |-
+                        UnhealthyMachineCondition represents a Machine condition type and value with a timeout
+                        specified as a duration. When the named condition has been in the given
+                        status for at least the timeout value, a machine is considered unhealthy.
+                      properties:
+                        status:
+                          description: status of the condition, one of True, False,
+                            Unknown.
+                          enum:
+                          - "True"
+                          - "False"
+                          - Unknown
+                          type: string
+                        timeoutSeconds:
+                          description: |-
+                            timeoutSeconds is the duration that a machine must be in a given status for,
+                            after which the machine is considered unhealthy.
+                            For example, with a value of "1h", the machine must match the status
+                            for at least 1 hour before being considered unhealthy.
+                          format: int32
+                          minimum: 0
+                          type: integer
+                        type:
+                          description: type of Machine condition
+                          maxLength: 316
+                          minLength: 1
+                          type: string
+                      required:
+                      - status
+                      - timeoutSeconds
+                      - type
+                      type: object
+                    maxItems: 100
+                    minItems: 1
+                    type: array
+                    x-kubernetes-list-type: atomic
                   unhealthyNodeConditions:
                     description: |-
                       unhealthyNodeConditions contains a list of conditions that determine
diff --git a/docs/book/src/tasks/automated-machine-management/healthchecking.md b/docs/book/src/tasks/automated-machine-management/healthchecking.md
index 124acd65bf9b..0630e4db0e42 100644
--- a/docs/book/src/tasks/automated-machine-management/healthchecking.md
+++ b/docs/book/src/tasks/automated-machine-management/healthchecking.md
@@ -58,6 +58,13 @@ spec:
     - type: Ready
       status: "False"
       timeout: 300s
+  unhealthyMachineConditions:
+    - type: Ready
+      status: Unknown
+      timeoutSeconds: 300
+    - type: Ready
+      status: "False"
+      timeoutSeconds: 300
 ```
 
 Use this example as the basis for defining a MachineHealthCheck for control plane nodes managed via
diff --git a/docs/book/src/tasks/experimental-features/cluster-class/write-clusterclass.md b/docs/book/src/tasks/experimental-features/cluster-class/write-clusterclass.md
index 946a5601af12..7a9ad1ae21de 100644
--- a/docs/book/src/tasks/experimental-features/cluster-class/write-clusterclass.md
+++ b/docs/book/src/tasks/experimental-features/cluster-class/write-clusterclass.md
@@ -194,6 +194,13 @@ spec:
       - type: Ready
         status: "False"
         timeoutSeconds: 300
+      unhealthyMachineConditions:
+      - type: Ready
+        status: Unknown
+        timeoutSeconds: 300
+      - type: Ready
+        status: "False"
+        timeoutSeconds: 300
       remediation:
         triggerIf:
           unhealthyLessThanOrEqualTo: 33%
@@ -211,6 +218,13 @@ spec:
       - type: Ready
         status: "False"
         timeoutSeconds: 300
+      unhealthyMachineConditions:
+      - type: Ready
+        status: Unknown
+        timeoutSeconds: 300
+      - type: Ready
+        status: "False"
+        timeoutSeconds: 300
       remediation:
         triggerIf:
           unhealthyInRange: "[0-2]"
diff --git a/docs/book/src/tasks/using-kustomize.md b/docs/book/src/tasks/using-kustomize.md
index 71f8b4c129da..34b0920932bb 100644
--- a/docs/book/src/tasks/using-kustomize.md
+++ b/docs/book/src/tasks/using-kustomize.md
@@ -96,6 +96,13 @@ spec:
     - type: Ready
       status: "False"
       timeout: 300s
+  unhealthyMachineConditions:
+    - type: Ready
+      status: Unknown
+      timeoutSeconds: 300
+    - type: Ready
+      status: "False"
+      timeoutSeconds: 300
 ```
 
 You would want to ensure the `clusterName` field in the MachineHealthCheck manifest appropriately
diff --git a/exp/topology/desiredstate/desired_state_test.go b/exp/topology/desiredstate/desired_state_test.go
index 2639ee076338..60e497b50736 100644
--- a/exp/topology/desiredstate/desired_state_test.go
+++ b/exp/topology/desiredstate/desired_state_test.go
@@ -42,6 +42,7 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/client/fake"
 
+	controlplanev1 "sigs.k8s.io/cluster-api/api/controlplane/kubeadm/v1beta2"
 	clusterv1beta1 "sigs.k8s.io/cluster-api/api/core/v1beta1"
 	clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2"
 	runtimehooksv1 "sigs.k8s.io/cluster-api/api/runtime/hooks/v1alpha1"
@@ -2457,6 +2458,20 @@ func TestComputeMachineDeployment(t *testing.T) {
 			TimeoutSeconds: ptr.To(int32(5 * 60)),
 		},
 	}
+
+	unhealthyMachineConditions := []clusterv1.UnhealthyMachineCondition{
+		{
+			Type:           controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition,
+			Status:         metav1.ConditionUnknown,
+			TimeoutSeconds: ptr.To(int32(5 * 60)),
+		},
+		{
+			Type:           controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition,
+			Status:         metav1.ConditionFalse,
+			TimeoutSeconds: ptr.To(int32(5 * 60)),
+		},
+	}
+
 	nodeTimeoutDuration := ptr.To(int32(1))
 
 	clusterClassFailureDomain := "A"
@@ -2484,8 +2499,9 @@ func TestComputeMachineDeployment(t *testing.T) {
 		WithBootstrapTemplate(workerBootstrapTemplate).
 		WithMachineHealthCheckClass(clusterv1.MachineDeploymentClassHealthCheck{
 			Checks: clusterv1.MachineDeploymentClassHealthCheckChecks{
-				UnhealthyNodeConditions:   unhealthyNodeConditions,
-				NodeStartupTimeoutSeconds: nodeTimeoutDuration,
+				UnhealthyNodeConditions:    unhealthyNodeConditions,
+				UnhealthyMachineConditions: unhealthyMachineConditions,
+				NodeStartupTimeoutSeconds:  nodeTimeoutDuration,
 			},
 		}).
 		WithReadinessGates(clusterClassReadinessGates).
@@ -2529,8 +2545,9 @@ func TestComputeMachineDeployment(t *testing.T) { InfrastructureMachineTemplate: workerInfrastructureMachineTemplate, HealthCheck: clusterv1.MachineDeploymentClassHealthCheck{ Checks: clusterv1.MachineDeploymentClassHealthCheckChecks{ - UnhealthyNodeConditions: unhealthyNodeConditions, - NodeStartupTimeoutSeconds: ptr.To(int32(1)), + UnhealthyNodeConditions: unhealthyNodeConditions, + UnhealthyMachineConditions: unhealthyMachineConditions, + NodeStartupTimeoutSeconds: ptr.To(int32(1)), }, }, }, @@ -2705,8 +2722,9 @@ func TestComputeMachineDeployment(t *testing.T) { InfrastructureMachineTemplate: workerInfrastructureMachineTemplate, HealthCheck: clusterv1.MachineDeploymentClassHealthCheck{ Checks: clusterv1.MachineDeploymentClassHealthCheckChecks{ - UnhealthyNodeConditions: unhealthyNodeConditions, - NodeStartupTimeoutSeconds: ptr.To(int32(1)), + UnhealthyNodeConditions: unhealthyNodeConditions, + UnhealthyMachineConditions: unhealthyMachineConditions, + NodeStartupTimeoutSeconds: ptr.To(int32(1)), }, }, }, @@ -2974,6 +2992,9 @@ func TestComputeMachineDeployment(t *testing.T) { // Check that UnhealthyNodeConditions are set as expected. g.Expect(actual.MachineHealthCheck.Spec.Checks.UnhealthyNodeConditions).To(BeComparableTo(unhealthyNodeConditions)) + + // Check that UnhealthyMachineConditions are set as expected. + g.Expect(actual.MachineHealthCheck.Spec.Checks.UnhealthyMachineConditions).To(BeComparableTo(unhealthyMachineConditions)) }) } @@ -4260,6 +4281,18 @@ func Test_computeMachineHealthCheck(t *testing.T) { TimeoutSeconds: ptr.To(int32(5 * 60)), }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionUnknown, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionFalse, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }, NodeStartupTimeoutSeconds: ptr.To(int32(1)), } selector := &metav1.LabelSelector{MatchLabels: map[string]string{ @@ -4302,6 +4335,18 @@ func Test_computeMachineHealthCheck(t *testing.T) { TimeoutSeconds: ptr.To(int32(5 * 60)), }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionUnknown, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionFalse, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }, NodeStartupTimeoutSeconds: ptr.To(int32(1)), }, }, diff --git a/exp/topology/scope/blueprint.go b/exp/topology/scope/blueprint.go index 15c20fe3c597..5a6786519dc7 100644 --- a/exp/topology/scope/blueprint.go +++ b/exp/topology/scope/blueprint.go @@ -118,8 +118,9 @@ func (b *ClusterBlueprint) IsControlPlaneMachineHealthCheckEnabled() bool { func (b *ClusterBlueprint) ControlPlaneMachineHealthCheckClass() (clusterv1.MachineHealthCheckChecks, clusterv1.MachineHealthCheckRemediation) { if b.Topology.ControlPlane.HealthCheck.IsDefined() { return clusterv1.MachineHealthCheckChecks{ - NodeStartupTimeoutSeconds: b.Topology.ControlPlane.HealthCheck.Checks.NodeStartupTimeoutSeconds, - UnhealthyNodeConditions: b.Topology.ControlPlane.HealthCheck.Checks.UnhealthyNodeConditions, + NodeStartupTimeoutSeconds: b.Topology.ControlPlane.HealthCheck.Checks.NodeStartupTimeoutSeconds, + UnhealthyNodeConditions: 
b.Topology.ControlPlane.HealthCheck.Checks.UnhealthyNodeConditions, + UnhealthyMachineConditions: b.Topology.ControlPlane.HealthCheck.Checks.UnhealthyMachineConditions, }, clusterv1.MachineHealthCheckRemediation{ TriggerIf: clusterv1.MachineHealthCheckRemediationTriggerIf{ UnhealthyLessThanOrEqualTo: b.Topology.ControlPlane.HealthCheck.Remediation.TriggerIf.UnhealthyLessThanOrEqualTo, @@ -130,8 +131,9 @@ func (b *ClusterBlueprint) ControlPlaneMachineHealthCheckClass() (clusterv1.Mach } return clusterv1.MachineHealthCheckChecks{ - NodeStartupTimeoutSeconds: b.ControlPlane.HealthCheck.Checks.NodeStartupTimeoutSeconds, - UnhealthyNodeConditions: b.ControlPlane.HealthCheck.Checks.UnhealthyNodeConditions, + NodeStartupTimeoutSeconds: b.ControlPlane.HealthCheck.Checks.NodeStartupTimeoutSeconds, + UnhealthyNodeConditions: b.ControlPlane.HealthCheck.Checks.UnhealthyNodeConditions, + UnhealthyMachineConditions: b.ControlPlane.HealthCheck.Checks.UnhealthyMachineConditions, }, clusterv1.MachineHealthCheckRemediation{ TriggerIf: clusterv1.MachineHealthCheckRemediationTriggerIf{ UnhealthyLessThanOrEqualTo: b.ControlPlane.HealthCheck.Remediation.TriggerIf.UnhealthyLessThanOrEqualTo, @@ -165,8 +167,9 @@ func (b *ClusterBlueprint) IsMachineDeploymentMachineHealthCheckEnabled(md *clus func (b *ClusterBlueprint) MachineDeploymentMachineHealthCheckClass(md *clusterv1.MachineDeploymentTopology) (clusterv1.MachineHealthCheckChecks, clusterv1.MachineHealthCheckRemediation) { if md.HealthCheck.IsDefined() { return clusterv1.MachineHealthCheckChecks{ - NodeStartupTimeoutSeconds: md.HealthCheck.Checks.NodeStartupTimeoutSeconds, - UnhealthyNodeConditions: md.HealthCheck.Checks.UnhealthyNodeConditions, + NodeStartupTimeoutSeconds: md.HealthCheck.Checks.NodeStartupTimeoutSeconds, + UnhealthyNodeConditions: md.HealthCheck.Checks.UnhealthyNodeConditions, + UnhealthyMachineConditions: md.HealthCheck.Checks.UnhealthyMachineConditions, }, clusterv1.MachineHealthCheckRemediation{ TriggerIf: clusterv1.MachineHealthCheckRemediationTriggerIf{ UnhealthyLessThanOrEqualTo: md.HealthCheck.Remediation.TriggerIf.UnhealthyLessThanOrEqualTo, @@ -177,8 +180,9 @@ func (b *ClusterBlueprint) MachineDeploymentMachineHealthCheckClass(md *clusterv } return clusterv1.MachineHealthCheckChecks{ - NodeStartupTimeoutSeconds: b.MachineDeployments[md.Class].HealthCheck.Checks.NodeStartupTimeoutSeconds, - UnhealthyNodeConditions: b.MachineDeployments[md.Class].HealthCheck.Checks.UnhealthyNodeConditions, + NodeStartupTimeoutSeconds: b.MachineDeployments[md.Class].HealthCheck.Checks.NodeStartupTimeoutSeconds, + UnhealthyNodeConditions: b.MachineDeployments[md.Class].HealthCheck.Checks.UnhealthyNodeConditions, + UnhealthyMachineConditions: b.MachineDeployments[md.Class].HealthCheck.Checks.UnhealthyMachineConditions, }, clusterv1.MachineHealthCheckRemediation{ TriggerIf: clusterv1.MachineHealthCheckRemediationTriggerIf{ UnhealthyLessThanOrEqualTo: b.MachineDeployments[md.Class].HealthCheck.Remediation.TriggerIf.UnhealthyLessThanOrEqualTo, diff --git a/exp/topology/scope/blueprint_test.go b/exp/topology/scope/blueprint_test.go index ca0cf5c635bb..0a244fe0bdb6 100644 --- a/exp/topology/scope/blueprint_test.go +++ b/exp/topology/scope/blueprint_test.go @@ -26,6 +26,7 @@ import ( "k8s.io/apimachinery/pkg/util/intstr" "k8s.io/utils/ptr" + controlplanev1 "sigs.k8s.io/cluster-api/api/controlplane/kubeadm/v1beta2" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" "sigs.k8s.io/cluster-api/util/test/builder" ) @@ -70,6 +71,13 @@ func 
TestIsControlPlaneMachineHealthCheckEnabled(t *testing.T) { TimeoutSeconds: ptr.To(int32(5 * 60)), }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionFalse, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }, }, }). Build(), @@ -93,6 +101,13 @@ func TestIsControlPlaneMachineHealthCheckEnabled(t *testing.T) { TimeoutSeconds: ptr.To(int32(5 * 60)), }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionUnknown, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }, }, }). Build(), @@ -119,6 +134,13 @@ func TestIsControlPlaneMachineHealthCheckEnabled(t *testing.T) { TimeoutSeconds: ptr.To(int32(5 * 60)), }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionUnknown, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }, }, }). Build(), @@ -148,6 +170,13 @@ func TestIsControlPlaneMachineHealthCheckEnabled(t *testing.T) { TimeoutSeconds: ptr.To(int32(5 * 60)), }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionUnknown, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }, }, }). Build(), @@ -172,6 +201,13 @@ func TestIsControlPlaneMachineHealthCheckEnabled(t *testing.T) { TimeoutSeconds: ptr.To(int32(5 * 60)), }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionUnknown, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }, }, }). Build(), @@ -196,6 +232,13 @@ func TestIsControlPlaneMachineHealthCheckEnabled(t *testing.T) { TimeoutSeconds: ptr.To(int32(5 * 60)), }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionUnknown, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }, }, }). 
Build(), @@ -232,6 +275,13 @@ func TestControlPlaneMachineHealthCheckClass(t *testing.T) { TimeoutSeconds: ptr.To(int32(20 * 60)), }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionFalse, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }, }, Remediation: clusterv1.ControlPlaneTopologyHealthCheckRemediation{ TriggerIf: clusterv1.ControlPlaneTopologyHealthCheckRemediationTriggerIf{ @@ -250,6 +300,13 @@ func TestControlPlaneMachineHealthCheckClass(t *testing.T) { TimeoutSeconds: ptr.To(int32(10 * 60)), }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionFalse, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }, }, }, }, @@ -262,6 +319,13 @@ func TestControlPlaneMachineHealthCheckClass(t *testing.T) { TimeoutSeconds: ptr.To(int32(20 * 60)), }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionFalse, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }, }, wantRemediation: clusterv1.MachineHealthCheckRemediation{ TriggerIf: clusterv1.MachineHealthCheckRemediationTriggerIf{ @@ -285,6 +349,13 @@ func TestControlPlaneMachineHealthCheckClass(t *testing.T) { TimeoutSeconds: ptr.To(int32(10 * 60)), }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionFalse, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }, }, }, }, @@ -297,6 +368,13 @@ func TestControlPlaneMachineHealthCheckClass(t *testing.T) { TimeoutSeconds: ptr.To(int32(10 * 60)), }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionFalse, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }, }, wantRemediation: clusterv1.MachineHealthCheckRemediation{}, }, @@ -345,6 +423,13 @@ func TestIsMachineDeploymentMachineHealthCheckEnabled(t *testing.T) { TimeoutSeconds: ptr.To(int32(5 * 60)), }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionUnknown, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }, }, }, }, @@ -369,6 +454,13 @@ func TestIsMachineDeploymentMachineHealthCheckEnabled(t *testing.T) { TimeoutSeconds: ptr.To(int32(5 * 60)), }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionUnknown, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }, }, }, }, @@ -396,6 +488,13 @@ func TestIsMachineDeploymentMachineHealthCheckEnabled(t *testing.T) { TimeoutSeconds: ptr.To(int32(5 * 60)), }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionUnknown, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }, }, }, }, @@ -427,6 +526,13 @@ func TestIsMachineDeploymentMachineHealthCheckEnabled(t *testing.T) { TimeoutSeconds: ptr.To(int32(5 * 60)), }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: 
controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionUnknown, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }, }, }, }, @@ -451,6 +557,13 @@ func TestIsMachineDeploymentMachineHealthCheckEnabled(t *testing.T) { TimeoutSeconds: ptr.To(int32(5 * 60)), }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionUnknown, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }, }, }, }, @@ -475,6 +588,13 @@ func TestIsMachineDeploymentMachineHealthCheckEnabled(t *testing.T) { TimeoutSeconds: ptr.To(int32(5 * 60)), }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionUnknown, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }, }, }, }, @@ -512,6 +632,13 @@ func TestMachineDeploymentMachineHealthCheckClass(t *testing.T) { TimeoutSeconds: ptr.To(int32(10 * 60)), }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionFalse, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }, }, }, }, @@ -528,6 +655,13 @@ func TestMachineDeploymentMachineHealthCheckClass(t *testing.T) { TimeoutSeconds: ptr.To(int32(20 * 60)), }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionFalse, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }, }, Remediation: clusterv1.MachineDeploymentTopologyHealthCheckRemediation{ TriggerIf: clusterv1.MachineDeploymentTopologyHealthCheckRemediationTriggerIf{ @@ -544,6 +678,13 @@ func TestMachineDeploymentMachineHealthCheckClass(t *testing.T) { TimeoutSeconds: ptr.To(int32(20 * 60)), }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionFalse, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }, }, wantRemediation: clusterv1.MachineHealthCheckRemediation{ TriggerIf: clusterv1.MachineHealthCheckRemediationTriggerIf{ @@ -565,6 +706,13 @@ func TestMachineDeploymentMachineHealthCheckClass(t *testing.T) { TimeoutSeconds: ptr.To(int32(10 * 60)), }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionFalse, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }, }, }, }, @@ -582,6 +730,13 @@ func TestMachineDeploymentMachineHealthCheckClass(t *testing.T) { TimeoutSeconds: ptr.To(int32(10 * 60)), }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionFalse, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }, }, wantRemediation: clusterv1.MachineHealthCheckRemediation{}, }, diff --git a/internal/api/core/v1alpha3/conversion_test.go b/internal/api/core/v1alpha3/conversion_test.go index 49ee38de30bb..4bac92df62b5 100644 --- a/internal/api/core/v1alpha3/conversion_test.go +++ b/internal/api/core/v1alpha3/conversion_test.go @@ -417,6 +417,7 @@ func spokeCluster(in *Cluster, c randfill.Continue) { func MachineHealthCheckFuzzFunc(_ runtimeserializer.CodecFactory) []interface{} { return []interface{}{ hubMachineHealthCheckStatus, 
+ hubMachineHealthCheckSpec, hubUnhealthyNodeCondition, spokeMachineHealthCheck, spokeMachineHealthCheckSpec, @@ -425,6 +426,15 @@ func MachineHealthCheckFuzzFunc(_ runtimeserializer.CodecFactory) []interface{} } } +func hubMachineHealthCheckSpec(in *clusterv1.MachineHealthCheckSpec, c randfill.Continue) { + c.FillNoCustom(in) + + // Drop UnhealthyMachineConditions as it does not exist in v1alpha3. + if in.Checks.UnhealthyMachineConditions != nil { + in.Checks.UnhealthyMachineConditions = nil + } +} + func hubMachineHealthCheckStatus(in *clusterv1.MachineHealthCheckStatus, c randfill.Continue) { c.FillNoCustom(in) // Drop empty structs with only omit empty fields. diff --git a/internal/api/core/v1alpha4/conversion_test.go b/internal/api/core/v1alpha4/conversion_test.go index 126009a1d891..8d6ca8f84b6b 100644 --- a/internal/api/core/v1alpha4/conversion_test.go +++ b/internal/api/core/v1alpha4/conversion_test.go @@ -486,6 +486,7 @@ func spokeMachineDeploymentSpec(in *MachineDeploymentSpec, c randfill.Continue) func MachineHealthCheckFuzzFunc(_ runtimeserializer.CodecFactory) []interface{} { return []interface{}{ hubMachineHealthCheckStatus, + hubMachineHealthCheckSpec, hubUnhealthyNodeCondition, spokeMachineHealthCheck, spokeMachineHealthCheckSpec, @@ -494,6 +495,15 @@ func MachineHealthCheckFuzzFunc(_ runtimeserializer.CodecFactory) []interface{} } } +func hubMachineHealthCheckSpec(in *clusterv1.MachineHealthCheckSpec, c randfill.Continue) { + c.FillNoCustom(in) + + // Drop UnhealthyMachineConditions as it does not exist in v1alpha4. + if in.Checks.UnhealthyMachineConditions != nil { + in.Checks.UnhealthyMachineConditions = nil + } +} + func hubMachineHealthCheckStatus(in *clusterv1.MachineHealthCheckStatus, c randfill.Continue) { c.FillNoCustom(in) // Drop empty structs with only omit empty fields. 
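The controller changes that follow apply the same timing rule to Machine conditions that already governs Node conditions: a condition only counts as unhealthy after it has matched the configured status for timeoutSeconds, and until then the target is requeued. Distilled into a standalone sketch (the helper name and signature are illustrative, not part of this patch):

package example

import "time"

// conditionCheckResult mirrors the per-condition decision in needsRemediation:
// once the condition has matched the configured status for timeoutSeconds, the
// target is unhealthy and remediation proceeds immediately; otherwise the
// controller re-checks shortly after the timeout would elapse.
func conditionCheckResult(lastTransition, now time.Time, timeoutSeconds int32) (unhealthy bool, requeueAfter time.Duration) {
	timeout := time.Duration(timeoutSeconds) * time.Second
	if lastTransition.Add(timeout).Before(now) {
		return true, 0 // unhealthy now; no requeue needed
	}
	// Re-check one second after the point at which the timeout expires.
	return false, timeout - now.Sub(lastTransition) + time.Second
}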
diff --git a/internal/controllers/machinehealthcheck/machinehealthcheck_controller_test.go b/internal/controllers/machinehealthcheck/machinehealthcheck_controller_test.go index 6d8eb6e86465..47b39e1b742e 100644 --- a/internal/controllers/machinehealthcheck/machinehealthcheck_controller_test.go +++ b/internal/controllers/machinehealthcheck/machinehealthcheck_controller_test.go @@ -41,6 +41,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/reconcile" "sigs.k8s.io/controller-runtime/pkg/webhook/admission" + controlplanev1 "sigs.k8s.io/cluster-api/api/controlplane/kubeadm/v1beta2" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" "sigs.k8s.io/cluster-api/api/core/v1beta2/index" "sigs.k8s.io/cluster-api/controllers/clustercache" @@ -1151,12 +1152,13 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { assertMachinesOwnerRemediated(g, mhc, 1) }) - t.Run("Machine's Node without conditions", func(t *testing.T) { + t.Run("Machine's Node and Machine without conditions", func(t *testing.T) { g := NewWithT(t) cluster := createCluster(g, ns.Name) mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name) mhc.Spec.Checks.UnhealthyNodeConditions = nil + mhc.Spec.Checks.UnhealthyMachineConditions = nil g.Expect(env.Create(ctx, mhc)).To(Succeed()) defer func(do ...client.Object) { @@ -1324,6 +1326,115 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { assertMachinesOwnerRemediated(g, mhc, 1) }) + t.Run("should react when a Machine transitions to unhealthy", func(t *testing.T) { + g := NewWithT(t) + cluster := createCluster(g, ns.Name) + + mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name) + + g.Expect(env.Create(ctx, mhc)).To(Succeed()) + defer func(do ...client.Object) { + g.Expect(env.Cleanup(ctx, do...)).To(Succeed()) + }(cluster, mhc) + + // Healthy nodes and machines. + _, machines, cleanup := createMachinesWithNodes(g, cluster, + count(1), + firstMachineAsControlPlane(), + createNodeRefForMachine(true), + nodeStatus(corev1.ConditionTrue), + machineLabels(mhc.Spec.Selector.MatchLabels), + ) + defer cleanup() + targetMachines := make([]string, len(machines)) + for i, m := range machines { + targetMachines[i] = m.Name + } + sort.Strings(targetMachines) + + // Make sure the status matches. + g.Eventually(func() *clusterv1.MachineHealthCheckStatus { + err := env.Get(ctx, util.ObjectKey(mhc), mhc) + if err != nil { + return nil + } + return &mhc.Status + }).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{ + ExpectedMachines: ptr.To[int32](1), + CurrentHealthy: ptr.To[int32](1), + RemediationsAllowed: ptr.To[int32](1), + ObservedGeneration: 1, + Targets: targetMachines, + Deprecated: &clusterv1.MachineHealthCheckDeprecatedStatus{ + V1Beta1: &clusterv1.MachineHealthCheckV1Beta1DeprecatedStatus{ + Conditions: clusterv1.Conditions{ + { + Type: clusterv1.RemediationAllowedV1Beta1Condition, + Status: corev1.ConditionTrue, + }, + }, + }, + }, + Conditions: []metav1.Condition{ + { + Type: clusterv1.MachineHealthCheckRemediationAllowedCondition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineHealthCheckRemediationAllowedReason, + }, + }, + })) + + assertMachinesNotHealthy(g, mhc, 0) + assertMachinesOwnerRemediated(g, mhc, 0) + + // Transition the machine to unhealthy. 
+ machine := machines[0] + machinePatch := client.MergeFrom(machine.DeepCopy()) + machine.Status.Conditions = []metav1.Condition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionFalse, + LastTransitionTime: metav1.NewTime(time.Now().Add(-10 * time.Minute)), + }, + } + g.Expect(env.Status().Patch(ctx, machine, machinePatch)).To(Succeed()) + + // Make sure the status matches. + g.Eventually(func() *clusterv1.MachineHealthCheckStatus { + err := env.Get(ctx, util.ObjectKey(mhc), mhc) + if err != nil { + return nil + } + return &mhc.Status + }).Should(MatchMachineHealthCheckStatus(&clusterv1.MachineHealthCheckStatus{ + ExpectedMachines: ptr.To[int32](1), + CurrentHealthy: ptr.To[int32](0), + RemediationsAllowed: ptr.To[int32](0), + ObservedGeneration: 1, + Targets: targetMachines, + Deprecated: &clusterv1.MachineHealthCheckDeprecatedStatus{ + V1Beta1: &clusterv1.MachineHealthCheckV1Beta1DeprecatedStatus{ + Conditions: clusterv1.Conditions{ + { + Type: clusterv1.RemediationAllowedV1Beta1Condition, + Status: corev1.ConditionTrue, + }, + }, + }, + }, + Conditions: []metav1.Condition{ + { + Type: clusterv1.MachineHealthCheckRemediationAllowedCondition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineHealthCheckRemediationAllowedReason, + }, + }, + })) + + assertMachinesNotHealthy(g, mhc, 1) + assertMachinesOwnerRemediated(g, mhc, 1) + }) + t.Run("when in a MachineSet, unhealthy machines should be deleted", func(t *testing.T) { g := NewWithT(t) cluster := createCluster(g, ns.Name) @@ -2767,6 +2878,13 @@ func newMachineHealthCheck(namespace, clusterName string) *clusterv1.MachineHeal TimeoutSeconds: ptr.To(int32(5 * 60)), }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: clusterv1.MachineOwnerRemediatedCondition, + Status: metav1.ConditionUnknown, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }, }, Remediation: clusterv1.MachineHealthCheckRemediation{ TriggerIf: clusterv1.MachineHealthCheckRemediationTriggerIf{ diff --git a/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go b/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go index f80820279bed..2cd9742f39ae 100644 --- a/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go +++ b/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go @@ -181,7 +181,7 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi return false, nextCheck } - // check conditions + // check node conditions for _, c := range t.MHC.Spec.Checks.UnhealthyNodeConditions { nodeCondition := getNodeCondition(t.Node, c.Type) @@ -191,7 +191,7 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi continue } - // If the condition has been in the unhealthy state for longer than the + // If the node condition has been in the unhealthy state for longer than the // timeout, return true with no requeue time. timeoutSecondsDuration := time.Duration(ptr.Deref(c.TimeoutSeconds, 0)) * time.Second @@ -214,6 +214,41 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi nextCheckTimes = append(nextCheckTimes, nextCheck) } } + + // check machine conditions + for _, c := range t.MHC.Spec.Checks.UnhealthyMachineConditions { + machineCondition := getMachineCondition(t.Machine, c.Type) + + // Skip when current machine condition is different from the one reported + // in the MachineHealthCheck. 
+		if machineCondition == nil || machineCondition.Status != c.Status {
+			continue
+		}
+
+		// If the machine condition has been in the unhealthy state for longer than the
+		// timeout, return true with no requeue time.
+		timeoutSecondsDuration := time.Duration(ptr.Deref(c.TimeoutSeconds, 0)) * time.Second
+
+		if machineCondition.LastTransitionTime.Add(timeoutSecondsDuration).Before(now) {
+			v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, clusterv1.UnhealthyMachineConditionV1Beta1Reason, clusterv1.ConditionSeverityWarning, "Condition %s on machine is reporting status %s for more than %s", c.Type, c.Status, timeoutSecondsDuration.String())
+			logger.V(3).Info("Target is unhealthy: machine condition is in state longer than allowed timeout", "condition", c.Type, "state", c.Status, "timeout", timeoutSecondsDuration.String())
+
+			conditions.Set(t.Machine, metav1.Condition{
+				Type:    clusterv1.MachineHealthCheckSucceededCondition,
+				Status:  metav1.ConditionFalse,
+				Reason:  clusterv1.MachineHealthCheckUnhealthyMachineReason,
+				Message: fmt.Sprintf("Health check failed: Condition %s on Machine is reporting status %s for more than %s", c.Type, c.Status, timeoutSecondsDuration.String()),
+			})
+			return true, time.Duration(0)
+		}
+
+		durationUnhealthy := now.Sub(machineCondition.LastTransitionTime.Time)
+		nextCheck := timeoutSecondsDuration - durationUnhealthy + time.Second
+		if nextCheck > 0 {
+			nextCheckTimes = append(nextCheckTimes, nextCheck)
+		}
+	}
+
 	return false, minDuration(nextCheckTimes)
 }
@@ -360,6 +395,16 @@ func getNodeCondition(node *corev1.Node, conditionType corev1.NodeConditionType)
 	return nil
 }
 
+// getMachineCondition returns the machine condition with the given type, or nil if it is not set.
+func getMachineCondition(machine *clusterv1.Machine, conditionType string) *metav1.Condition {
+	for _, cond := range machine.Status.Conditions {
+		if cond.Type == conditionType {
+			return &cond
+		}
+	}
+	return nil
+}
+
 func minDuration(durations []time.Duration) time.Duration {
 	if len(durations) == 0 {
 		return time.Duration(0)
diff --git a/internal/controllers/machinehealthcheck/machinehealthcheck_targets_test.go b/internal/controllers/machinehealthcheck/machinehealthcheck_targets_test.go
index b995cef7f4bb..aae1a1f6fedd 100644
--- a/internal/controllers/machinehealthcheck/machinehealthcheck_targets_test.go
+++ b/internal/controllers/machinehealthcheck/machinehealthcheck_targets_test.go
@@ -30,6 +30,7 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/client/fake"
 
+	controlplanev1 "sigs.k8s.io/cluster-api/api/controlplane/kubeadm/v1beta2"
 	clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2"
 	"sigs.k8s.io/cluster-api/util/conditions"
 	v1beta1conditions "sigs.k8s.io/cluster-api/util/conditions/deprecated/v1beta1"
@@ -71,6 +72,13 @@ func TestGetTargetsFromMHC(t *testing.T) {
 					TimeoutSeconds: ptr.To(int32(5 * 60)),
 				},
 			},
+			UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{
+				{
+					Type:           controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition,
+					Status:         metav1.ConditionUnknown,
+					TimeoutSeconds: ptr.To(int32(5 * 60)),
+				},
+			},
 		},
 	},
 }
@@ -215,6 +223,7 @@ func TestHealthCheckTargets(t *testing.T) {
 	timeoutForMachineToHaveNode := 10 * time.Minute
 	disabledTimeoutForMachineToHaveNode := time.Duration(0)
 	timeoutForUnhealthyNodeConditions := int32(5 * 60)
+	timeoutForUnhealthyMachineConditions := int32(5 * 60)
 
 	// Create a test MHC
 	testMHC := &clusterv1.MachineHealthCheck{
@@ -240,6 +249,18 @@ func TestHealthCheckTargets(t *testing.T) {
 					TimeoutSeconds:
ptr.To(timeoutForUnhealthyNodeConditions), }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionUnknown, + TimeoutSeconds: ptr.To(timeoutForUnhealthyMachineConditions), + }, + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionFalse, + TimeoutSeconds: ptr.To(timeoutForUnhealthyMachineConditions), + }, + }, }, }, } @@ -370,7 +391,7 @@ func TestHealthCheckTargets(t *testing.T) { nodeMissing: false, } nodeUnknown400Condition := newFailedHealthCheckV1Beta1Condition(clusterv1.UnhealthyNodeConditionV1Beta1Reason, "Condition Ready on node is reporting status Unknown for more than %s", (time.Duration(timeoutForUnhealthyNodeConditions) * time.Second).String()) - nodeUnknown400V1Beta2Condition := newFailedHealthCheckCondition(clusterv1.MachineHealthCheckUnhealthyNodeReason, "Health check failed: Condition Ready on Node is reporting status Unknown for more than %s", (time.Duration(timeoutForUnhealthyNodeConditions) * time.Second).String()) + nodeUnknown400V1Beta2Condition := newFailedHealthCheckCondition(clusterv1.MachineHealthCheckUnhealthyNodeReason, "Health check failed: Condition Ready on Node is reporting status Unknown for more than %s", (time.Duration(timeoutForUnhealthyMachineConditions) * time.Second).String()) // Target for when a node is healthy testNodeHealthy := newTestNode("node1") @@ -383,6 +404,36 @@ func TestHealthCheckTargets(t *testing.T) { nodeMissing: false, } + // Machine unhealthy for shorter than timeout + testMachineUnhealthy200 := newTestUnhealthyMachine("machine1", namespace, clusterName, "node1", mhcSelector, controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, metav1.ConditionFalse, 200*time.Second) + machineUnhealthy200 := healthCheckTarget{ + Cluster: cluster, + MHC: testMHC, + Machine: testMachineUnhealthy200, + Node: testNodeHealthy, + nodeMissing: false, + } + + // Machine unhealthy for longer than timeout + testMachineUnhealthy400 := newTestUnhealthyMachine("machine1", namespace, clusterName, "node1", mhcSelector, controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, metav1.ConditionFalse, 400*time.Second) + machineUnhealthy400 := healthCheckTarget{ + Cluster: cluster, + MHC: testMHC, + Machine: testMachineUnhealthy400, + Node: testNodeHealthy, + nodeMissing: false, + } + machineUnhealthy400Condition := newFailedHealthCheckV1Beta1Condition( + clusterv1.UnhealthyMachineConditionV1Beta1Reason, + "Condition EtcdPodHealthy on machine is reporting status False for more than %s", + (time.Duration(timeoutForUnhealthyMachineConditions) * time.Second).String(), + ) + machineUnhealthy400V1Beta2Condition := newFailedHealthCheckCondition( + clusterv1.MachineHealthCheckUnhealthyMachineReason, + "Health check failed: Condition EtcdPodHealthy on Machine is reporting status False for more than %s", + (time.Duration(timeoutForUnhealthyMachineConditions) * time.Second).String(), + ) + // Target for when the machine has the remediate machine annotation const annotationRemediationMsg = "Marked for remediation via remediate-machine annotation" const annotationRemediationV1Beta2Msg = "Health check failed: marked for remediation via cluster.x-k8s.io/remediate-machine annotation" @@ -455,6 +506,22 @@ func TestHealthCheckTargets(t *testing.T) { expectedNeedsRemediationV1Beta2Condition: []metav1.Condition{nodeUnknown400V1Beta2Condition}, expectedNextCheckTimes: []time.Duration{}, }, + { + 
desc: "when the machine condition has been unhealthy for shorter than the timeout", + targets: []healthCheckTarget{machineUnhealthy200}, + expectedHealthy: []healthCheckTarget{}, + expectedNeedsRemediation: []healthCheckTarget{}, + expectedNextCheckTimes: []time.Duration{100 * time.Second}, // 300s timeout - 200s elapsed = 100s remaining + }, + { + desc: "when the machine condition has been unhealthy for longer than the timeout", + targets: []healthCheckTarget{machineUnhealthy400}, + expectedHealthy: []healthCheckTarget{}, + expectedNeedsRemediation: []healthCheckTarget{machineUnhealthy400}, + expectedNeedsRemediationCondition: []clusterv1.Condition{machineUnhealthy400Condition}, + expectedNeedsRemediationV1Beta2Condition: []metav1.Condition{machineUnhealthy400V1Beta2Condition}, + expectedNextCheckTimes: []time.Duration{}, + }, { desc: "when the node is healthy", targets: []healthCheckTarget{nodeHealthy}, @@ -632,6 +699,51 @@ func newTestUnhealthyNode(name string, condition corev1.NodeConditionType, statu } } +func newTestUnhealthyMachine(name, namespace, clusterName, nodeName string, labels map[string]string, condition string, status metav1.ConditionStatus, unhealthyDuration time.Duration) *clusterv1.Machine { + // Copy the labels so that the map is unique to each test Machine + l := make(map[string]string) + for k, v := range labels { + l[k] = v + } + l[clusterv1.ClusterNameLabel] = clusterName + + bootstrap := "bootstrap" + return &clusterv1.Machine{ + TypeMeta: metav1.TypeMeta{ + APIVersion: clusterv1.GroupVersion.String(), + Kind: "Machine", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + Labels: l, + }, + Spec: clusterv1.MachineSpec{ + ClusterName: clusterName, + Bootstrap: clusterv1.Bootstrap{ + DataSecretName: &bootstrap, + }, + }, + Status: clusterv1.MachineStatus{ + Conditions: []metav1.Condition{ + { + Type: condition, + Status: status, + LastTransitionTime: metav1.NewTime(time.Now().Add(-unhealthyDuration)), + }, + }, + Initialization: clusterv1.MachineInitializationStatus{ + InfrastructureProvisioned: ptr.To(true), + BootstrapDataSecretCreated: ptr.To(true), + }, + Phase: string(clusterv1.MachinePhaseRunning), + NodeRef: clusterv1.MachineNodeReference{ + Name: nodeName, + }, + }, + } +} + func newFailedHealthCheckV1Beta1Condition(reason string, messageFormat string, messageArgs ...interface{}) clusterv1.Condition { return *v1beta1conditions.FalseCondition(clusterv1.MachineHealthCheckSucceededV1Beta1Condition, reason, clusterv1.ConditionSeverityWarning, messageFormat, messageArgs...) } diff --git a/internal/controllers/topology/cluster/current_state_test.go b/internal/controllers/topology/cluster/current_state_test.go index b73865f0d762..e9e5d348ec06 100644 --- a/internal/controllers/topology/cluster/current_state_test.go +++ b/internal/controllers/topology/cluster/current_state_test.go @@ -34,6 +34,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client/fake" . "sigs.k8s.io/controller-runtime/pkg/envtest/komega" + controlplanev1 "sigs.k8s.io/cluster-api/api/controlplane/kubeadm/v1beta2" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" "sigs.k8s.io/cluster-api/exp/topology/scope" "sigs.k8s.io/cluster-api/internal/topology/selectors" @@ -169,6 +170,18 @@ func testGetCurrentState(t *testing.T, controlPlaneContractVersion string) { TimeoutSeconds: ptr.To(int32(5 * 60)), }, }). 
+ WithUnhealthyMachineConditions([]clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionUnknown, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionFalse, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }). WithClusterName("cluster1"). Build() @@ -186,6 +199,18 @@ func testGetCurrentState(t *testing.T, controlPlaneContractVersion string) { TimeoutSeconds: ptr.To(int32(5 * 60)), }, }). + WithUnhealthyMachineConditions([]clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionUnknown, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionFalse, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }). WithClusterName("cluster1"). Build() diff --git a/internal/controllers/topology/cluster/reconcile_state_test.go b/internal/controllers/topology/cluster/reconcile_state_test.go index 70f89a72ea62..3e2fed00704d 100644 --- a/internal/controllers/topology/cluster/reconcile_state_test.go +++ b/internal/controllers/topology/cluster/reconcile_state_test.go @@ -43,6 +43,7 @@ import ( . "sigs.k8s.io/controller-runtime/pkg/envtest/komega" "sigs.k8s.io/controller-runtime/pkg/webhook/admission" + controlplanev1 "sigs.k8s.io/cluster-api/api/controlplane/kubeadm/v1beta2" clusterv1beta1 "sigs.k8s.io/cluster-api/api/core/v1beta1" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" runtimehooksv1 "sigs.k8s.io/cluster-api/api/runtime/hooks/v1alpha1" @@ -1906,6 +1907,13 @@ func TestReconcileControlPlaneMachineHealthCheck(t *testing.T) { TimeoutSeconds: ptr.To(int32(5 * 60)), }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionUnknown, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }, }, } maxUnhealthy := intstr.Parse("45%") @@ -1926,6 +1934,7 @@ func TestReconcileControlPlaneMachineHealthCheck(t *testing.T) { mhcBuilder := builder.MachineHealthCheck(metav1.NamespaceDefault, "cp1"). WithSelector(*selectors.ForControlPlaneMHC()). WithUnhealthyNodeConditions(mhcClass.Checks.UnhealthyNodeConditions). + WithUnhealthyMachineConditions(mhcClass.Checks.UnhealthyMachineConditions). WithClusterName("cluster1") tests := []struct { @@ -3436,6 +3445,13 @@ func TestReconcileMachineDeploymentMachineHealthCheck(t *testing.T) { TimeoutSeconds: ptr.To(int32(5 * 60)), }, }). + WithUnhealthyMachineConditions([]clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionUnknown, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }). WithClusterName("cluster1") infrastructureMachineTemplate := builder.TestInfrastructureMachineTemplate(metav1.NamespaceDefault, "infrastructure-machine-1").Build() @@ -3886,6 +3902,13 @@ func TestReconciler_reconcileMachineHealthCheck(t *testing.T) { TimeoutSeconds: ptr.To(int32(5 * 60)), }, }). + WithUnhealthyMachineConditions([]clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionUnknown, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }). 
WithClusterName("cluster1") tests := []struct { name string @@ -3910,6 +3933,12 @@ func TestReconciler_reconcileMachineHealthCheck(t *testing.T) { Status: corev1.ConditionUnknown, TimeoutSeconds: ptr.To(int32(1000 * 60)), }, + }).WithUnhealthyMachineConditions([]clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionUnknown, + TimeoutSeconds: ptr.To(int32(1000 * 60)), + }, }).Build(), want: mhcBuilder.DeepCopy().WithUnhealthyNodeConditions([]clusterv1.UnhealthyNodeCondition{ { @@ -3917,6 +3946,12 @@ func TestReconciler_reconcileMachineHealthCheck(t *testing.T) { Status: corev1.ConditionUnknown, TimeoutSeconds: ptr.To(int32(1000 * 60)), }, + }).WithUnhealthyMachineConditions([]clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionUnknown, + TimeoutSeconds: ptr.To(int32(1000 * 60)), + }, }).Build(), }, { diff --git a/internal/webhooks/cluster_test.go b/internal/webhooks/cluster_test.go index ec114b445d01..e057223792d5 100644 --- a/internal/webhooks/cluster_test.go +++ b/internal/webhooks/cluster_test.go @@ -39,6 +39,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client/interceptor" "sigs.k8s.io/controller-runtime/pkg/webhook/admission" + controlplanev1 "sigs.k8s.io/cluster-api/api/controlplane/kubeadm/v1beta2" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" "sigs.k8s.io/cluster-api/feature" "sigs.k8s.io/cluster-api/internal/webhooks/util" @@ -2369,8 +2370,9 @@ func TestClusterTopologyValidationWithClient(t *testing.T) { WithControlPlaneReplicas(3). WithControlPlaneMachineHealthCheck(clusterv1.ControlPlaneTopologyHealthCheck{ Checks: clusterv1.ControlPlaneTopologyHealthCheckChecks{ - UnhealthyNodeConditions: []clusterv1.UnhealthyNodeCondition{}, - NodeStartupTimeoutSeconds: ptr.To(int32(30)), + UnhealthyNodeConditions: []clusterv1.UnhealthyNodeCondition{}, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{}, + NodeStartupTimeoutSeconds: ptr.To(int32(30)), }, }). Build()). @@ -2397,6 +2399,12 @@ func TestClusterTopologyValidationWithClient(t *testing.T) { Status: corev1.ConditionFalse, }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionFalse, + }, + }, }, }). Build()). @@ -2429,6 +2437,13 @@ func TestClusterTopologyValidationWithClient(t *testing.T) { TimeoutSeconds: ptr.To(int32(5 * 60)), }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionUnknown, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }, }, }). Build(), @@ -2452,6 +2467,12 @@ func TestClusterTopologyValidationWithClient(t *testing.T) { Status: corev1.ConditionFalse, }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionFalse, + }, + }, }, }). Build()). @@ -2501,8 +2522,9 @@ func TestClusterTopologyValidationWithClient(t *testing.T) { WithClass("worker-class"). 
WithMachineHealthCheck(clusterv1.MachineDeploymentTopologyHealthCheck{ Checks: clusterv1.MachineDeploymentTopologyHealthCheckChecks{ - UnhealthyNodeConditions: []clusterv1.UnhealthyNodeCondition{}, - NodeStartupTimeoutSeconds: ptr.To(int32(30)), + UnhealthyNodeConditions: []clusterv1.UnhealthyNodeCondition{}, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{}, + NodeStartupTimeoutSeconds: ptr.To(int32(30)), }, }). Build(), @@ -2547,6 +2569,13 @@ func TestClusterTopologyValidationWithClient(t *testing.T) { TimeoutSeconds: ptr.To(int32(5 * 60)), }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionUnknown, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }, }, }). Build(), @@ -2575,6 +2604,12 @@ func TestClusterTopologyValidationWithClient(t *testing.T) { Status: corev1.ConditionFalse, }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionFalse, + }, + }, }, }). Build(), diff --git a/internal/webhooks/clusterclass_test.go b/internal/webhooks/clusterclass_test.go index 600fa24c304a..ab709b0b5a21 100644 --- a/internal/webhooks/clusterclass_test.go +++ b/internal/webhooks/clusterclass_test.go @@ -32,6 +32,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" + controlplanev1 "sigs.k8s.io/cluster-api/api/controlplane/kubeadm/v1beta2" clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2" "sigs.k8s.io/cluster-api/api/core/v1beta2/index" "sigs.k8s.io/cluster-api/feature" @@ -700,6 +701,13 @@ func TestClusterClassValidation(t *testing.T) { TimeoutSeconds: ptr.To(int32(5 * 60)), }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionUnknown, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }, NodeStartupTimeoutSeconds: ptr.To(int32(60)), }, }). @@ -764,6 +772,13 @@ func TestClusterClassValidation(t *testing.T) { TimeoutSeconds: ptr.To(int32(5 * 60)), }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionUnknown, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }, NodeStartupTimeoutSeconds: ptr.To(int32(60)), }, }). @@ -793,7 +808,13 @@ func TestClusterClassValidation(t *testing.T) { TimeoutSeconds: ptr.To(int32(5 * 60)), }, }, - // nodeStartupTimeout is too short here. + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionUnknown, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }, NodeStartupTimeoutSeconds: ptr.To(int32(10)), }, }). @@ -2213,6 +2234,13 @@ func TestClusterClassValidationWithClusterAwareChecks(t *testing.T) { TimeoutSeconds: ptr.To(int32(5 * 60)), }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionUnknown, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }, }, }). 
Build(), @@ -2249,6 +2277,13 @@ func TestClusterClassValidationWithClusterAwareChecks(t *testing.T) { TimeoutSeconds: ptr.To(int32(5 * 60)), }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionUnknown, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }, }, }). Build(), @@ -2277,6 +2312,13 @@ func TestClusterClassValidationWithClusterAwareChecks(t *testing.T) { TimeoutSeconds: ptr.To(int32(5 * 60)), }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionUnknown, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }, }, }). Build()). @@ -2297,6 +2339,13 @@ func TestClusterClassValidationWithClusterAwareChecks(t *testing.T) { TimeoutSeconds: ptr.To(int32(5 * 60)), }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionUnknown, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }, }, }). Build(), @@ -2345,6 +2394,13 @@ func TestClusterClassValidationWithClusterAwareChecks(t *testing.T) { TimeoutSeconds: ptr.To(int32(5 * 60)), }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionUnknown, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }, }, }). Build(), @@ -2400,6 +2456,13 @@ func TestClusterClassValidationWithClusterAwareChecks(t *testing.T) { TimeoutSeconds: ptr.To(int32(5 * 60)), }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionUnknown, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }, }, }). Build(), @@ -2440,6 +2503,13 @@ func TestClusterClassValidationWithClusterAwareChecks(t *testing.T) { TimeoutSeconds: ptr.To(int32(5 * 60)), }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionUnknown, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }, }, }). Build()). @@ -2467,6 +2537,13 @@ func TestClusterClassValidationWithClusterAwareChecks(t *testing.T) { TimeoutSeconds: ptr.To(int32(5 * 60)), }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionUnknown, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + }, }, }). 
				Build(),
diff --git a/internal/webhooks/machinehealthcheck_test.go b/internal/webhooks/machinehealthcheck_test.go
index 6fd395365c4c..35d7d598a738 100644
--- a/internal/webhooks/machinehealthcheck_test.go
+++ b/internal/webhooks/machinehealthcheck_test.go
@@ -25,6 +25,7 @@ import (
 	"k8s.io/apimachinery/pkg/util/intstr"
 	"k8s.io/utils/ptr"
 
+	controlplanev1 "sigs.k8s.io/cluster-api/api/controlplane/kubeadm/v1beta2"
 	clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2"
 	"sigs.k8s.io/cluster-api/internal/webhooks/util"
 )
@@ -46,6 +47,12 @@ func TestMachineHealthCheckDefault(t *testing.T) {
 					Status: corev1.ConditionFalse,
 				},
 			},
+			UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{
+				{
+					Type:   controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition,
+					Status: metav1.ConditionFalse,
+				},
+			},
 		},
 	},
 }
@@ -92,6 +99,12 @@ func TestMachineHealthCheckLabelSelectorAsSelectorValidation(t *testing.T) {
 					Status: corev1.ConditionFalse,
 				},
 			},
+			UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{
+				{
+					Type:   controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition,
+					Status: metav1.ConditionFalse,
+				},
+			},
 		},
 	},
 }
@@ -156,6 +169,12 @@ func TestMachineHealthCheckClusterNameImmutable(t *testing.T) {
 					Status: corev1.ConditionFalse,
 				},
 			},
+			UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{
+				{
+					Type:   controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition,
+					Status: metav1.ConditionFalse,
+				},
+			},
 		},
 	},
 }
@@ -174,6 +193,12 @@
 					Status: corev1.ConditionFalse,
 				},
 			},
+			UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{
+				{
+					Type:   controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition,
+					Status: metav1.ConditionFalse,
+				},
+			},
 		},
 	},
 }
@@ -253,6 +278,71 @@ func TestMachineHealthCheckUnhealthyNodeConditions(t *testing.T) {
 	}
 }
 
+func TestMachineHealthCheckUnhealthyMachineConditions(t *testing.T) {
+	tests := []struct {
+		name                       string
+		unhealthyMachineConditions []clusterv1.UnhealthyMachineCondition
+		expectErr                  bool
+	}{
+		{
+			name: "pass with correctly defined unhealthyMachineConditions",
+			unhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{
+				{
+					Type:   controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition,
+					Status: metav1.ConditionFalse,
+				},
+			},
+			expectErr: false,
+		},
+		{
+			name:                       "do not fail if the UnhealthyMachineCondition array is nil",
+			unhealthyMachineConditions: nil,
+			expectErr:                  false,
+		},
+		{
+			name:                       "do not fail if the UnhealthyMachineCondition array is empty",
+			unhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{},
+			expectErr:                  false,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			g := NewWithT(t)
+			mhc := &clusterv1.MachineHealthCheck{
+				Spec: clusterv1.MachineHealthCheckSpec{
+					Selector: metav1.LabelSelector{
+						MatchLabels: map[string]string{
+							"test": "test",
+						},
+					},
+					Checks: clusterv1.MachineHealthCheckChecks{
+						UnhealthyMachineConditions: tt.unhealthyMachineConditions,
+					},
+				},
+			}
+			webhook := &MachineHealthCheck{}
+
+			if tt.expectErr {
+				warnings, err := webhook.ValidateCreate(ctx, mhc)
+				g.Expect(err).To(HaveOccurred())
+				g.Expect(warnings).To(BeEmpty())
+				warnings, err = webhook.ValidateUpdate(ctx, mhc, mhc)
+				g.Expect(err).To(HaveOccurred())
+				g.Expect(warnings).To(BeEmpty())
+			} else {
+				warnings, err := webhook.ValidateCreate(ctx, mhc)
+				g.Expect(err).ToNot(HaveOccurred())
+				g.Expect(warnings).To(BeEmpty())
+				warnings, err = webhook.ValidateUpdate(ctx,
mhc, mhc) + g.Expect(err).ToNot(HaveOccurred()) + g.Expect(warnings).To(BeEmpty()) + } + }) + } +} + func TestMachineHealthCheckNodeStartupTimeout(t *testing.T) { zero := int32(0) twentyNineSeconds := int32(29) @@ -315,6 +405,12 @@ func TestMachineHealthCheckNodeStartupTimeout(t *testing.T) { Status: corev1.ConditionFalse, }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionFalse, + }, + }, }, }, } @@ -383,6 +479,12 @@ func TestMachineHealthCheckMaxUnhealthy(t *testing.T) { Status: corev1.ConditionFalse, }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionFalse, + }, + }, }, Remediation: clusterv1.MachineHealthCheckRemediation{ TriggerIf: clusterv1.MachineHealthCheckRemediationTriggerIf{ @@ -422,6 +524,12 @@ func TestMachineHealthCheckSelectorValidation(t *testing.T) { Status: corev1.ConditionFalse, }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionFalse, + }, + }, }, }, } @@ -450,6 +558,12 @@ func TestMachineHealthCheckClusterNameSelectorValidation(t *testing.T) { Status: corev1.ConditionFalse, }, }, + UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionFalse, + }, + }, }, }, } diff --git a/test/e2e/data/infrastructure-docker/main/cluster-template-kcp-remediation/mhc.yaml b/test/e2e/data/infrastructure-docker/main/cluster-template-kcp-remediation/mhc.yaml index 3bd890e2f495..74c4561765ac 100644 --- a/test/e2e/data/infrastructure-docker/main/cluster-template-kcp-remediation/mhc.yaml +++ b/test/e2e/data/infrastructure-docker/main/cluster-template-kcp-remediation/mhc.yaml @@ -3,6 +3,7 @@ # - a selector that targets all the machines with label cluster.x-k8s.io/control-plane="" and the mhc-test: "fail" (the label is used to trigger remediation in a controlled way - by adding CP under MHC control intentionally -) # - nodeStartupTimeoutSeconds: 30s (to force remediation on nodes still provisioning) # - unhealthyNodeConditions triggering remediation after 10s the e2e.remediation.condition condition is set to false (to force remediation on nodes already provisioned) +# - unhealthyMachineConditions triggering remediation after 10s the condition is set to false (to force remediation on machines already provisioned) apiVersion: cluster.x-k8s.io/v1beta2 kind: MachineHealthCheck metadata: @@ -19,6 +20,10 @@ spec: - type: e2e.remediation.condition status: "False" timeoutSeconds: 10 + unhealthyMachineConditions: + - type: "Ready" + status: "False" + timeoutSeconds: 10 remediation: triggerIf: unhealthyLessThanOrEqualTo: 100% diff --git a/test/e2e/data/infrastructure-docker/main/cluster-template-md-remediation/mhc.yaml b/test/e2e/data/infrastructure-docker/main/cluster-template-md-remediation/mhc.yaml index 1c65f920dc68..f5c9736b06a2 100644 --- a/test/e2e/data/infrastructure-docker/main/cluster-template-md-remediation/mhc.yaml +++ b/test/e2e/data/infrastructure-docker/main/cluster-template-md-remediation/mhc.yaml @@ -1,7 +1,7 @@ --- # MachineHealthCheck object with # - a selector that targets all the machines with label e2e.remediation.label="" -# - unhealthyNodeConditions triggering remediation after 10s the 
condition is set
+# - unhealthyNodeConditions and unhealthyMachineConditions triggering remediation 10s after the condition is set
 apiVersion: cluster.x-k8s.io/v1beta2
 kind: MachineHealthCheck
 metadata:
@@ -16,6 +16,10 @@ spec:
     - type: e2e.remediation.condition
       status: "False"
       timeoutSeconds: 10
+    unhealthyMachineConditions:
+    - type: "Ready"
+      status: "False"
+      timeoutSeconds: 10
   remediation:
     triggerIf:
       unhealthyLessThanOrEqualTo: 100%
diff --git a/test/e2e/data/infrastructure-docker/main/clusterclass-quick-start.yaml b/test/e2e/data/infrastructure-docker/main/clusterclass-quick-start.yaml
index 48b41b1508d2..70bf9231d2b2 100644
--- a/test/e2e/data/infrastructure-docker/main/clusterclass-quick-start.yaml
+++ b/test/e2e/data/infrastructure-docker/main/clusterclass-quick-start.yaml
@@ -24,6 +24,10 @@ spec:
       - type: e2e.remediation.condition
         status: "False"
         timeoutSeconds: 20
+      unhealthyMachineConditions:
+      - type: "Ready"
+        status: "False"
+        timeoutSeconds: 20
     infrastructure:
       templateRef:
         apiVersion: infrastructure.cluster.x-k8s.io/v1beta2
@@ -47,7 +51,7 @@ spec:
             apiVersion: infrastructure.cluster.x-k8s.io/v1beta2
             kind: DockerMachineTemplate
            name: quick-start-default-worker-machinetemplate
-      # We are intentionally not setting the 'unhealthyNodeConditions' here to test that the field is optional.
+      # We are intentionally not setting 'unhealthyNodeConditions' and 'unhealthyMachineConditions' here to test that those fields are optional.
     machinePools:
     - class: default-worker
       metadata:
diff --git a/test/e2e/data/infrastructure-docker/v1.11/clusterclass-quick-start.yaml b/test/e2e/data/infrastructure-docker/v1.11/clusterclass-quick-start.yaml
index 48b41b1508d2..e747a5918951 100644
--- a/test/e2e/data/infrastructure-docker/v1.11/clusterclass-quick-start.yaml
+++ b/test/e2e/data/infrastructure-docker/v1.11/clusterclass-quick-start.yaml
@@ -24,6 +24,10 @@ spec:
       - type: e2e.remediation.condition
         status: "False"
         timeoutSeconds: 20
+      unhealthyMachineConditions:
+      - type: "Ready"
+        status: "False"
+        timeoutSeconds: 20
     infrastructure:
       templateRef:
         apiVersion: infrastructure.cluster.x-k8s.io/v1beta2
@@ -47,7 +51,7 @@ spec:
             apiVersion: infrastructure.cluster.x-k8s.io/v1beta2
             kind: DockerMachineTemplate
            name: quick-start-default-worker-machinetemplate
-      # We are intentionally not setting the 'unhealthyNodeConditions' here to test that the field is optional.
+      # We are intentionally not setting 'unhealthyNodeConditions' and 'unhealthyMachineConditions' here to test that those fields are optional.
     machinePools:
     - class: default-worker
       metadata:
diff --git a/test/e2e/kcp_remediations.go b/test/e2e/kcp_remediations.go
index fa80f9cbd871..0edccd668bad 100644
--- a/test/e2e/kcp_remediations.go
+++ b/test/e2e/kcp_remediations.go
@@ -79,6 +79,10 @@ type KCPRemediationSpecInput struct {
	//   - type: e2e.remediation.condition
	//     status: "False"
	//     timeout: 10s
+	//   unhealthyMachineConditions:
+	//   - type: "Ready"
+	//     status: "False"
+	//     timeoutSeconds: 10
	// If not specified, "kcp-remediation" is used.
	Flavor *string
diff --git a/test/framework/machine_helpers.go b/test/framework/machine_helpers.go
index 75d3364417ab..f3018594db9d 100644
--- a/test/framework/machine_helpers.go
+++ b/test/framework/machine_helpers.go
@@ -22,6 +22,7 @@ import (
 	.
"github.com/onsi/gomega" "github.com/pkg/errors" corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/klog/v2" "sigs.k8s.io/controller-runtime/pkg/client" @@ -251,6 +252,48 @@ func PatchNodeCondition(ctx context.Context, input PatchNodeConditionInput) { }, retryableOperationTimeout, retryableOperationInterval).Should(Succeed(), "Failed to patch node %s", input.Machine.Status.NodeRef.Name) } +type PatchMachineConditionInput struct { + ClusterProxy ClusterProxy + Cluster *clusterv1.Cluster + Machine *clusterv1.Machine + MachineCondition clusterv1.Condition +} + +// PatchMachineCondition patches a Machine resource's status with a given condition +// to simulate an unhealthy machine for MHC testing. +func PatchMachineCondition(ctx context.Context, input PatchMachineConditionInput) { + Expect(ctx).NotTo(BeNil(), "ctx is required for PatchMachineCondition") + Expect(input.ClusterProxy).ToNot(BeNil(), "Invalid argument. input.ClusterProxy can't be nil when calling PatchMachineCondition") + Expect(input.Cluster).ToNot(BeNil(), "Invalid argument. input.Cluster can't be nil when calling PatchMachineCondition") + Expect(input.Machine).ToNot(BeNil(), "Invalid argument. input.Machine can't be nil when calling PatchMachineCondition") + + machine := &clusterv1.Machine{} + Eventually(func() error { + return input.ClusterProxy.GetClient().Get(ctx, types.NamespacedName{ + Namespace: input.Machine.Namespace, + Name: input.Machine.Name, + }, machine) + }, retryableOperationTimeout, retryableOperationInterval).Should(Succeed(), "Failed to get Machine %s", input.Machine.Name) + + patchHelper, err := patch.NewHelper(machine, input.ClusterProxy.GetClient()) + Expect(err).ToNot(HaveOccurred()) + + // Convert clusterv1.Condition -> metav1.Condition + cond := metav1.Condition{ + Type: string(input.MachineCondition.Type), + Status: metav1.ConditionStatus(input.MachineCondition.Status), + LastTransitionTime: input.MachineCondition.LastTransitionTime, + Reason: input.MachineCondition.Reason, + Message: input.MachineCondition.Message, + } + + machine.Status.Conditions = append(machine.Status.Conditions, cond) + + Eventually(func() error { + return patchHelper.Patch(ctx, machine) + }, retryableOperationTimeout, retryableOperationInterval).Should(Succeed(), "Failed to patch Machine %s", input.Machine.Name) +} + // MachineStatusCheck is a type that operates a status check on a Machine. 
type MachineStatusCheck func(p *clusterv1.Machine) error
diff --git a/test/framework/machinehealthcheck_helpers.go b/test/framework/machinehealthcheck_helpers.go
index 1a147d4afcc6..35622708f9db 100644
--- a/test/framework/machinehealthcheck_helpers.go
+++ b/test/framework/machinehealthcheck_helpers.go
@@ -57,6 +57,7 @@ func DiscoverMachineHealthChecksAndWaitForRemediation(ctx context.Context, input
 
 	for _, mhc := range machineHealthChecks {
 		Expect(mhc.Spec.Checks.UnhealthyNodeConditions).NotTo(BeEmpty())
+		Expect(mhc.Spec.Checks.UnhealthyMachineConditions).NotTo(BeEmpty())
 
 		fmt.Fprintln(GinkgoWriter, "Ensuring there is at least 1 Machine that MachineHealthCheck is matching")
 		machines := GetMachinesByMachineHealthCheck(ctx, GetMachinesByMachineHealthCheckInput{
@@ -80,6 +81,21 @@ func DiscoverMachineHealthChecksAndWaitForRemediation(ctx context.Context, input
 			Machine:       machines[0],
 		})
 
+		fmt.Fprintln(GinkgoWriter, "Patching MachineHealthCheck unhealthy machine condition to one of the machines")
+		unhealthyMachineCondition := clusterv1.Condition{
+			Type:               clusterv1.ConditionType(mhc.Spec.Checks.UnhealthyMachineConditions[0].Type),
+			Status:             corev1.ConditionStatus(mhc.Spec.Checks.UnhealthyMachineConditions[0].Status),
+			LastTransitionTime: metav1.Time{Time: time.Now()},
+		}
+
+		PatchMachineCondition(ctx, PatchMachineConditionInput{
+			ClusterProxy:     input.ClusterProxy,
+			Cluster:          input.Cluster,
+			Machine:          machines[0].DeepCopy(),
+			MachineCondition: unhealthyMachineCondition,
+		})
+
 		fmt.Fprintln(GinkgoWriter, "Waiting for remediation")
 		WaitForMachineHealthCheckToRemediateUnhealthyNodeCondition(ctx, WaitForMachineHealthCheckToRemediateUnhealthyNodeConditionInput{
 			ClusterProxy:       input.ClusterProxy,
@@ -87,6 +103,14 @@ func DiscoverMachineHealthChecksAndWaitForRemediation(ctx context.Context, input
 			MachineHealthCheck: mhc,
 			MachinesCount:      len(machines),
 		}, input.WaitForMachineRemediation...)
+
+		fmt.Fprintln(GinkgoWriter, "Waiting for remediation of unhealthy machine condition")
+		WaitForMachineHealthCheckToRemediateUnhealthyMachineCondition(ctx, WaitForMachineHealthCheckToRemediateUnhealthyNodeConditionInput{
+			ClusterProxy:       input.ClusterProxy,
+			Cluster:            input.Cluster,
+			MachineHealthCheck: mhc,
+			MachinesCount:      len(machines),
+		}, input.WaitForMachineRemediation...)
 	}
 }
 
@@ -168,6 +192,35 @@ func WaitForMachineHealthCheckToRemediateUnhealthyNodeCondition(ctx context.Cont
 	}, intervals...).Should(BeTrue())
 }
 
+// WaitForMachineHealthCheckToRemediateUnhealthyMachineCondition waits until Machines whose conditions match the
+// MachineHealthCheck's unhealthyMachineConditions are remediated. The unhealthy machine condition itself is
+// patched beforehand via PatchMachineCondition.
+func WaitForMachineHealthCheckToRemediateUnhealthyMachineCondition(ctx context.Context, input WaitForMachineHealthCheckToRemediateUnhealthyNodeConditionInput, intervals ...interface{}) {
+	Expect(ctx).NotTo(BeNil(), "ctx is required for WaitForMachineHealthCheckToRemediateUnhealthyMachineCondition")
+	Expect(input.ClusterProxy).ToNot(BeNil(), "Invalid argument. input.ClusterProxy can't be nil when calling WaitForMachineHealthCheckToRemediateUnhealthyMachineCondition")
+	Expect(input.Cluster).ToNot(BeNil(), "Invalid argument. input.Cluster can't be nil when calling WaitForMachineHealthCheckToRemediateUnhealthyMachineCondition")
+	Expect(input.MachineHealthCheck).NotTo(BeNil(), "Invalid argument. input.MachineHealthCheck can't be nil when calling WaitForMachineHealthCheckToRemediateUnhealthyMachineCondition")
+	Expect(input.MachinesCount).NotTo(BeZero(), "Invalid argument.
input.MachinesCount can't be zero when calling WaitForMachineHealthCheckToRemediateUnhealthyMachineCondition") + + fmt.Fprintln(GinkgoWriter, "Waiting until the machine with unhealthy machine condition is remediated") + + Eventually(func() bool { + machines := GetMachinesByMachineHealthCheck(ctx, GetMachinesByMachineHealthCheckInput{ + Lister: input.ClusterProxy.GetClient(), + ClusterName: input.Cluster.Name, + MachineHealthCheck: input.MachineHealthCheck, + }) + if len(machines) < input.MachinesCount { + return false + } + + for _, machine := range machines { + if hasMatchingUnhealthyMachineConditions(input.MachineHealthCheck, machine.Status.Conditions) { + return false + } + } + return true + }, intervals...).Should(BeTrue()) +} + // hasMatchingUnhealthyNodeConditions returns true if any node condition matches with machine health check unhealthy conditions. func hasMatchingUnhealthyNodeConditions(machineHealthCheck *clusterv1.MachineHealthCheck, nodeConditions []corev1.NodeCondition) bool { for _, unhealthyNodeCondition := range machineHealthCheck.Spec.Checks.UnhealthyNodeConditions { @@ -179,3 +232,15 @@ func hasMatchingUnhealthyNodeConditions(machineHealthCheck *clusterv1.MachineHea } return false } + +func hasMatchingUnhealthyMachineConditions(machineHealthCheck *clusterv1.MachineHealthCheck, machineConditions []metav1.Condition) bool { + for _, unhealthyMachineCondition := range machineHealthCheck.Spec.Checks.UnhealthyMachineConditions { + for _, machineCondition := range machineConditions { + if machineCondition.Type == unhealthyMachineCondition.Type && + machineCondition.Status == unhealthyMachineCondition.Status { + return true + } + } + } + return false +} diff --git a/test/infrastructure/docker/templates/clusterclass-in-memory.yaml b/test/infrastructure/docker/templates/clusterclass-in-memory.yaml index de31343dc479..f6903476e7cc 100644 --- a/test/infrastructure/docker/templates/clusterclass-in-memory.yaml +++ b/test/infrastructure/docker/templates/clusterclass-in-memory.yaml @@ -126,6 +126,13 @@ spec: - type: Ready status: "False" timeoutSeconds: 300 + unhealthyMachineConditions: + - type: "Ready" + status: Unknown + timeoutSeconds: 300 + - type: "Ready" + status: "False" + timeoutSeconds: 300 infrastructure: templateRef: apiVersion: infrastructure.cluster.x-k8s.io/v1beta2 @@ -153,3 +160,10 @@ spec: - type: Ready status: "False" timeoutSeconds: 300 + unhealthyMachineConditions: + - type: "Ready" + status: Unknown + timeoutSeconds: 300 + - type: "Ready" + status: "False" + timeoutSeconds: 300 diff --git a/test/infrastructure/docker/templates/clusterclass-quick-start.yaml b/test/infrastructure/docker/templates/clusterclass-quick-start.yaml index ea82a08d06c5..15d7897087d1 100644 --- a/test/infrastructure/docker/templates/clusterclass-quick-start.yaml +++ b/test/infrastructure/docker/templates/clusterclass-quick-start.yaml @@ -22,6 +22,13 @@ spec: - type: Ready status: "False" timeoutSeconds: 300 + unhealthyMachineConditions: + - type: "Ready" + status: Unknown + timeoutSeconds: 300 + - type: "Ready" + status: "False" + timeoutSeconds: 300 infrastructure: templateRef: apiVersion: infrastructure.cluster.x-k8s.io/v1beta2 @@ -49,6 +56,13 @@ spec: - type: Ready status: "False" timeoutSeconds: 300 + unhealthyMachineConditions: + - type: "Ready" + status: Unknown + timeoutSeconds: 300 + - type: "Ready" + status: "False" + timeoutSeconds: 300 machinePools: - class: default-worker bootstrap: diff --git a/util/test/builder/builders.go b/util/test/builder/builders.go index 
28777cfc7f51..417620e2c210 100644 --- a/util/test/builder/builders.go +++ b/util/test/builder/builders.go @@ -2087,13 +2087,14 @@ func setStatusFields(obj *unstructured.Unstructured, fields map[string]interface // MachineHealthCheckBuilder holds fields for creating a MachineHealthCheck. type MachineHealthCheckBuilder struct { - name string - namespace string - ownerRefs []metav1.OwnerReference - selector metav1.LabelSelector - clusterName string - unhealthyNodeConditions []clusterv1.UnhealthyNodeCondition - maxUnhealthy *intstr.IntOrString + name string + namespace string + ownerRefs []metav1.OwnerReference + selector metav1.LabelSelector + clusterName string + unhealthyNodeConditions []clusterv1.UnhealthyNodeCondition + unhealthyMachineConditions []clusterv1.UnhealthyMachineCondition + maxUnhealthy *intstr.IntOrString } // MachineHealthCheck returns a MachineHealthCheckBuilder with the given name and namespace. @@ -2122,6 +2123,12 @@ func (m *MachineHealthCheckBuilder) WithUnhealthyNodeConditions(conditions []clu return m } +// WithUnhealthyMachineConditions adds the spec used to build the parameters of the MachineHealthCheck. +func (m *MachineHealthCheckBuilder) WithUnhealthyMachineConditions(conditions []clusterv1.UnhealthyMachineCondition) *MachineHealthCheckBuilder { + m.unhealthyMachineConditions = conditions + return m +} + // WithOwnerReferences adds ownerreferences for the MachineHealthCheck. func (m *MachineHealthCheckBuilder) WithOwnerReferences(ownerRefs []metav1.OwnerReference) *MachineHealthCheckBuilder { m.ownerRefs = ownerRefs @@ -2147,7 +2154,8 @@ func (m *MachineHealthCheckBuilder) Build() *clusterv1.MachineHealthCheck { ClusterName: m.clusterName, Selector: m.selector, Checks: clusterv1.MachineHealthCheckChecks{ - UnhealthyNodeConditions: m.unhealthyNodeConditions, + UnhealthyNodeConditions: m.unhealthyNodeConditions, + UnhealthyMachineConditions: m.unhealthyMachineConditions, }, Remediation: clusterv1.MachineHealthCheckRemediation{ TriggerIf: clusterv1.MachineHealthCheckRemediationTriggerIf{ diff --git a/util/test/builder/zz_generated.deepcopy.go b/util/test/builder/zz_generated.deepcopy.go index 3f5cf4a6c67f..432e10925d46 100644 --- a/util/test/builder/zz_generated.deepcopy.go +++ b/util/test/builder/zz_generated.deepcopy.go @@ -630,6 +630,13 @@ func (in *MachineHealthCheckBuilder) DeepCopyInto(out *MachineHealthCheckBuilder (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.unhealthyMachineConditions != nil { + in, out := &in.unhealthyMachineConditions, &out.unhealthyMachineConditions + *out = make([]v1beta2.UnhealthyMachineCondition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } if in.maxUnhealthy != nil { in, out := &in.maxUnhealthy, &out.maxUnhealthy *out = new(intstr.IntOrString) From c6a7148120943d5c2c6fc2e35f7fe4640753eb3f Mon Sep 17 00:00:00 2001 From: Furkat Gofurov Date: Wed, 8 Oct 2025 01:07:47 +0300 Subject: [PATCH 2/8] Fix PR check Markdown links CI Signed-off-by: Furkat Gofurov --- .../experimental-features/cluster-class/write-clusterclass.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/book/src/tasks/experimental-features/cluster-class/write-clusterclass.md b/docs/book/src/tasks/experimental-features/cluster-class/write-clusterclass.md index 7a9ad1ae21de..347ad38f85e2 100644 --- a/docs/book/src/tasks/experimental-features/cluster-class/write-clusterclass.md +++ b/docs/book/src/tasks/experimental-features/cluster-class/write-clusterclass.md @@ -246,7 +246,7 @@ in `KubeadmControlPlane`. 
Use cases like this can be implemented with ClusterCla The following example shows how variables can be defined in the ClusterClass. A variable definition specifies the name and the schema of a variable and if it is required. The schema defines how a variable is defaulted and validated. It supports -a subset of the schema of CRDs. For more information please see the [godoc](https://doc.crds.dev/github.com/kubernetes-sigs/cluster-api/cluster.x-k8s.io/ClusterClass/v1beta1#spec-variables-schema-openAPIV3Schema). +a subset of the schema of CRDs. For more information please see the [godoc](https://doc.crds.dev/github.com/kubernetes-sigs/cluster-api/cluster.x-k8s.io/ClusterClass/v1beta2#spec-variables-schema-openAPIV3Schema). ```yaml apiVersion: cluster.x-k8s.io/v1beta2 @@ -281,7 +281,7 @@ The variable can then be used in a patch to set a field on a template referenced The `selector` specifies on which template the patch should be applied. `jsonPatches` specifies which JSON patches should be applied to that template. In this case we set the `imageRepository` field of the `KubeadmControlPlaneTemplate` to the value of the variable `imageRepository`. For more information -please see the [godoc](https://doc.crds.dev/github.com/kubernetes-sigs/cluster-api/cluster.x-k8s.io/ClusterClass/v1beta1#spec-patches-definitions). +please see the [godoc](https://doc.crds.dev/github.com/kubernetes-sigs/cluster-api/cluster.x-k8s.io/ClusterClass/v1beta2#spec-patches-definitions). ```yaml apiVersion: cluster.x-k8s.io/v1beta2 From 5ee7d25eba8cdb949491f17f92c6cc651da16f09 Mon Sep 17 00:00:00 2001 From: Furkat Gofurov Date: Mon, 13 Oct 2025 20:59:56 +0300 Subject: [PATCH 3/8] Address review comments Signed-off-by: Furkat Gofurov --- api/core/v1beta1/conversion.go | 12 ++++ api/core/v1beta1/conversion_test.go | 47 -------------- api/core/v1beta2/machinehealthcheck_types.go | 6 +- api/core/v1beta2/zz_generated.openapi.go | 4 +- .../cluster.x-k8s.io_clusterclasses.yaml | 10 +-- .../crd/bases/cluster.x-k8s.io_clusters.yaml | 10 +-- .../cluster.x-k8s.io_machinehealthchecks.yaml | 5 +- internal/api/core/v1alpha3/conversion.go | 9 +++ internal/api/core/v1alpha4/conversion.go | 12 ++++ .../machinehealthcheck_controller_test.go | 3 +- .../machinehealthcheck_targets.go | 4 +- .../machinehealthcheck_targets_test.go | 2 +- internal/webhooks/clusterclass_test.go | 1 + .../cluster-template-kcp-remediation/mhc.yaml | 5 -- .../cluster-template-md-remediation/mhc.yaml | 6 +- .../main/clusterclass-quick-start.yaml | 4 +- .../v1.11/clusterclass-quick-start.yaml | 6 +- test/e2e/kcp_remediations.go | 4 -- test/framework/machine_helpers.go | 43 ------------ test/framework/machinehealthcheck_helpers.go | 65 ------------------- 20 files changed, 63 insertions(+), 195 deletions(-) diff --git a/api/core/v1beta1/conversion.go b/api/core/v1beta1/conversion.go index 9f67740c47ac..b7d2a57dc148 100644 --- a/api/core/v1beta1/conversion.go +++ b/api/core/v1beta1/conversion.go @@ -73,6 +73,11 @@ func (src *Cluster) ConvertTo(dstRaw conversion.Hub) error { return err } + dst.Spec.Topology.ControlPlane.HealthCheck.Checks.UnhealthyMachineConditions = restored.Spec.Topology.ControlPlane.HealthCheck.Checks.UnhealthyMachineConditions + for i, md := range restored.Spec.Topology.Workers.MachineDeployments { + dst.Spec.Topology.Workers.MachineDeployments[i].HealthCheck.Checks.UnhealthyMachineConditions = md.HealthCheck.Checks.UnhealthyMachineConditions + } + // Recover intent for bool values converted to *bool. 
clusterv1.Convert_bool_To_Pointer_bool(src.Spec.Paused, ok, restored.Spec.Paused, &dst.Spec.Paused) @@ -145,6 +150,11 @@ func (src *ClusterClass) ConvertTo(dstRaw conversion.Hub) error { return err } + dst.Spec.ControlPlane.HealthCheck.Checks.UnhealthyMachineConditions = restored.Spec.ControlPlane.HealthCheck.Checks.UnhealthyMachineConditions + for i, md := range restored.Spec.Workers.MachineDeployments { + dst.Spec.Workers.MachineDeployments[i].HealthCheck.Checks.UnhealthyMachineConditions = md.HealthCheck.Checks.UnhealthyMachineConditions + } + // Recover intent for bool values converted to *bool. for i, patch := range dst.Spec.Patches { for j, definition := range patch.Definitions { @@ -513,6 +523,8 @@ func (src *MachineHealthCheck) ConvertTo(dstRaw conversion.Hub) error { return err } + dst.Spec.Checks.UnhealthyMachineConditions = restored.Spec.Checks.UnhealthyMachineConditions + clusterv1.Convert_int32_To_Pointer_int32(src.Status.ExpectedMachines, ok, restored.Status.ExpectedMachines, &dst.Status.ExpectedMachines) clusterv1.Convert_int32_To_Pointer_int32(src.Status.CurrentHealthy, ok, restored.Status.CurrentHealthy, &dst.Status.CurrentHealthy) clusterv1.Convert_int32_To_Pointer_int32(src.Status.RemediationsAllowed, ok, restored.Status.RemediationsAllowed, &dst.Status.RemediationsAllowed) diff --git a/api/core/v1beta1/conversion_test.go b/api/core/v1beta1/conversion_test.go index a0dbc5f2fdba..10d557048460 100644 --- a/api/core/v1beta1/conversion_test.go +++ b/api/core/v1beta1/conversion_test.go @@ -99,7 +99,6 @@ func ClusterFuzzFuncs(_ runtimeserializer.CodecFactory) []interface{} { hubClusterVariable, hubFailureDomain, hubUnhealthyNodeCondition, - hubUnhealthyMachineCondition, spokeCluster, spokeClusterTopology, spokeObjectReference, @@ -127,17 +126,6 @@ func hubClusterSpec(in *clusterv1.ClusterSpec, c randfill.Continue) { in.ControlPlaneRef.APIGroup = gvk.Group in.ControlPlaneRef.Kind = gvk.Kind } - - // remove MachineHealthCheck.UnhealthyMachineConditions as it does not exist in v1beta1. - if in.Topology.IsDefined() && in.Topology.ControlPlane.HealthCheck.IsDefined() { - in.Topology.ControlPlane.HealthCheck.Checks.UnhealthyMachineConditions = nil - } - - if in.Topology.IsDefined() && len(in.Topology.Workers.MachineDeployments) > 0 { - for i := range in.Topology.Workers.MachineDeployments { - in.Topology.Workers.MachineDeployments[i].HealthCheck.Checks.UnhealthyMachineConditions = nil - } - } } func hubClusterStatus(in *clusterv1.ClusterStatus, c randfill.Continue) { @@ -189,14 +177,6 @@ func hubUnhealthyNodeCondition(in *clusterv1.UnhealthyNodeCondition, c randfill. 
} } -func hubUnhealthyMachineCondition(in *clusterv1.UnhealthyMachineCondition, c randfill.Continue) { - c.FillNoCustom(in) - - if in.TimeoutSeconds == nil { - in.TimeoutSeconds = ptr.To(int32(0)) // TimeoutSeconds is a required field and nil does not round trip - } -} - func spokeCluster(in *Cluster, c randfill.Continue) { c.FillNoCustom(in) @@ -287,14 +267,12 @@ func spokeClusterVariable(in *ClusterVariable, c randfill.Continue) { func ClusterClassFuncs(_ runtimeserializer.CodecFactory) []interface{} { return []interface{}{ - hubClusterClassSpec, hubClusterClassVariable, hubClusterClassStatusVariableDefinition, hubClusterClassStatus, hubJSONPatch, hubJSONSchemaProps, hubUnhealthyNodeCondition, - hubUnhealthyMachineCondition, spokeClusterClass, spokeObjectReference, spokeClusterClassStatus, @@ -309,21 +287,6 @@ func ClusterClassFuncs(_ runtimeserializer.CodecFactory) []interface{} { } } -func hubClusterClassSpec(in *clusterv1.ClusterClassSpec, c randfill.Continue) { - c.FillNoCustom(in) - - // remove MachineHealthCheck.UnhealthyMachineConditions as it does not exist in v1beta1. - if in.ControlPlane.HealthCheck.IsDefined() && in.ControlPlane.HealthCheck.Checks.UnhealthyMachineConditions != nil { - in.ControlPlane.HealthCheck.Checks.UnhealthyMachineConditions = nil - } - - if len(in.Workers.MachineDeployments) > 0 { - for i := range in.Workers.MachineDeployments { - in.Workers.MachineDeployments[i].HealthCheck.Checks.UnhealthyMachineConditions = nil - } - } -} - func hubClusterClassVariable(in *clusterv1.ClusterClassVariable, c randfill.Continue) { c.FillNoCustom(in) @@ -765,9 +728,7 @@ func spokeMachineDeploymentStatus(in *MachineDeploymentStatus, c randfill.Contin func MachineHealthCheckFuzzFuncs(_ runtimeserializer.CodecFactory) []interface{} { return []interface{}{ hubUnhealthyNodeCondition, - hubUnhealthyMachineCondition, hubMachineHealthCheckStatus, - hubMachineHealthCheckSpec, spokeMachineHealthCheck, spokeMachineHealthCheckSpec, spokeObjectReference, @@ -776,14 +737,6 @@ func MachineHealthCheckFuzzFuncs(_ runtimeserializer.CodecFactory) []interface{} } } -func hubMachineHealthCheckSpec(in *clusterv1.MachineHealthCheckSpec, c randfill.Continue) { - c.FillNoCustom(in) - - if in.Checks.UnhealthyMachineConditions != nil { - in.Checks.UnhealthyMachineConditions = nil - } -} - func hubMachineHealthCheckStatus(in *clusterv1.MachineHealthCheckStatus, c randfill.Continue) { c.FillNoCustom(in) // Drop empty structs with only omit empty fields. diff --git a/api/core/v1beta2/machinehealthcheck_types.go b/api/core/v1beta2/machinehealthcheck_types.go index f3e7aa1767f7..6aa54feff092 100644 --- a/api/core/v1beta2/machinehealthcheck_types.go +++ b/api/core/v1beta2/machinehealthcheck_types.go @@ -237,7 +237,7 @@ type UnhealthyNodeCondition struct { // timeoutSeconds is the duration that a node must be in a given status for, // after which the node is considered unhealthy. - // For example, with a value of "1h", the node must match the status + // For example, with a value of "3600", the node must match the status // for at least 1 hour before being considered unhealthy. // +required // +kubebuilder:validation:Minimum=0 @@ -249,7 +249,7 @@ type UnhealthyNodeCondition struct { // status for at least the timeout value, a machine is considered unhealthy. 
type UnhealthyMachineCondition struct { // type of Machine condition - // +kubebuilder:validation:Type=string + // +kubebuilder:validation:Pattern=`^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$` // +kubebuilder:validation:MinLength=1 // +kubebuilder:validation:MaxLength=316 // +required @@ -262,7 +262,7 @@ type UnhealthyMachineCondition struct { // timeoutSeconds is the duration that a machine must be in a given status for, // after which the machine is considered unhealthy. - // For example, with a value of "1h", the machine must match the status + // For example, with a value of "3600", the machine must match the status // for at least 1 hour before being considered unhealthy. // +required // +kubebuilder:validation:Minimum=0 diff --git a/api/core/v1beta2/zz_generated.openapi.go b/api/core/v1beta2/zz_generated.openapi.go index 65aafa05e781..c9bdfb70a604 100644 --- a/api/core/v1beta2/zz_generated.openapi.go +++ b/api/core/v1beta2/zz_generated.openapi.go @@ -6708,7 +6708,7 @@ func schema_cluster_api_api_core_v1beta2_UnhealthyMachineCondition(ref common.Re }, "timeoutSeconds": { SchemaProps: spec.SchemaProps{ - Description: "timeoutSeconds is the duration that a machine must be in a given status for, after which the machine is considered unhealthy. For example, with a value of \"1h\", the machine must match the status for at least 1 hour before being considered unhealthy.", + Description: "timeoutSeconds is the duration that a machine must be in a given status for, after which the machine is considered unhealthy. For example, with a value of \"3600\", the machine must match the status for at least 1 hour before being considered unhealthy.", Type: []string{"integer"}, Format: "int32", }, @@ -6743,7 +6743,7 @@ func schema_cluster_api_api_core_v1beta2_UnhealthyNodeCondition(ref common.Refer }, "timeoutSeconds": { SchemaProps: spec.SchemaProps{ - Description: "timeoutSeconds is the duration that a node must be in a given status for, after which the node is considered unhealthy. For example, with a value of \"1h\", the node must match the status for at least 1 hour before being considered unhealthy.", + Description: "timeoutSeconds is the duration that a node must be in a given status for, after which the node is considered unhealthy. For example, with a value of \"3600\", the node must match the status for at least 1 hour before being considered unhealthy.", Type: []string{"integer"}, Format: "int32", }, diff --git a/config/crd/bases/cluster.x-k8s.io_clusterclasses.yaml b/config/crd/bases/cluster.x-k8s.io_clusterclasses.yaml index c4b83e79fcaa..e820a7a57b39 100644 --- a/config/crd/bases/cluster.x-k8s.io_clusterclasses.yaml +++ b/config/crd/bases/cluster.x-k8s.io_clusterclasses.yaml @@ -2992,7 +2992,7 @@ spec: description: |- timeoutSeconds is the duration that a machine must be in a given status for, after which the machine is considered unhealthy. - For example, with a value of "1h", the machine must match the status + For example, with a value of "3600", the machine must match the status for at least 1 hour before being considered unhealthy. 
format: int32 minimum: 0 @@ -3001,6 +3001,7 @@ spec: description: type of Machine condition maxLength: 316 minLength: 1 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ type: string required: - status @@ -3031,7 +3032,7 @@ spec: description: |- timeoutSeconds is the duration that a node must be in a given status for, after which the node is considered unhealthy. - For example, with a value of "1h", the node must match the status + For example, with a value of "3600", the node must match the status for at least 1 hour before being considered unhealthy. format: int32 minimum: 0 @@ -4211,7 +4212,7 @@ spec: description: |- timeoutSeconds is the duration that a machine must be in a given status for, after which the machine is considered unhealthy. - For example, with a value of "1h", the machine must match the status + For example, with a value of "3600", the machine must match the status for at least 1 hour before being considered unhealthy. format: int32 minimum: 0 @@ -4220,6 +4221,7 @@ spec: description: type of Machine condition maxLength: 316 minLength: 1 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ type: string required: - status @@ -4250,7 +4252,7 @@ spec: description: |- timeoutSeconds is the duration that a node must be in a given status for, after which the node is considered unhealthy. - For example, with a value of "1h", the node must match the status + For example, with a value of "3600", the node must match the status for at least 1 hour before being considered unhealthy. format: int32 minimum: 0 diff --git a/config/crd/bases/cluster.x-k8s.io_clusters.yaml b/config/crd/bases/cluster.x-k8s.io_clusters.yaml index 124f314076c2..276b0d78786f 100644 --- a/config/crd/bases/cluster.x-k8s.io_clusters.yaml +++ b/config/crd/bases/cluster.x-k8s.io_clusters.yaml @@ -2554,7 +2554,7 @@ spec: description: |- timeoutSeconds is the duration that a machine must be in a given status for, after which the machine is considered unhealthy. - For example, with a value of "1h", the machine must match the status + For example, with a value of "3600", the machine must match the status for at least 1 hour before being considered unhealthy. format: int32 minimum: 0 @@ -2563,6 +2563,7 @@ spec: description: type of Machine condition maxLength: 316 minLength: 1 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ type: string required: - status @@ -2593,7 +2594,7 @@ spec: description: |- timeoutSeconds is the duration that a node must be in a given status for, after which the node is considered unhealthy. - For example, with a value of "1h", the node must match the status + For example, with a value of "3600", the node must match the status for at least 1 hour before being considered unhealthy. format: int32 minimum: 0 @@ -2997,7 +2998,7 @@ spec: description: |- timeoutSeconds is the duration that a machine must be in a given status for, after which the machine is considered unhealthy. - For example, with a value of "1h", the machine must match the status + For example, with a value of "3600", the machine must match the status for at least 1 hour before being considered unhealthy. 
format: int32 minimum: 0 @@ -3006,6 +3007,7 @@ spec: description: type of Machine condition maxLength: 316 minLength: 1 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ type: string required: - status @@ -3036,7 +3038,7 @@ spec: description: |- timeoutSeconds is the duration that a node must be in a given status for, after which the node is considered unhealthy. - For example, with a value of "1h", the node must match the status + For example, with a value of "3600", the node must match the status for at least 1 hour before being considered unhealthy. format: int32 minimum: 0 diff --git a/config/crd/bases/cluster.x-k8s.io_machinehealthchecks.yaml b/config/crd/bases/cluster.x-k8s.io_machinehealthchecks.yaml index 559cab242b23..194a14be80e2 100644 --- a/config/crd/bases/cluster.x-k8s.io_machinehealthchecks.yaml +++ b/config/crd/bases/cluster.x-k8s.io_machinehealthchecks.yaml @@ -1088,7 +1088,7 @@ spec: description: |- timeoutSeconds is the duration that a machine must be in a given status for, after which the machine is considered unhealthy. - For example, with a value of "1h", the machine must match the status + For example, with a value of "3600", the machine must match the status for at least 1 hour before being considered unhealthy. format: int32 minimum: 0 @@ -1097,6 +1097,7 @@ spec: description: type of Machine condition maxLength: 316 minLength: 1 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ type: string required: - status @@ -1127,7 +1128,7 @@ spec: description: |- timeoutSeconds is the duration that a node must be in a given status for, after which the node is considered unhealthy. - For example, with a value of "1h", the node must match the status + For example, with a value of "3600", the node must match the status for at least 1 hour before being considered unhealthy. format: int32 minimum: 0 diff --git a/internal/api/core/v1alpha3/conversion.go b/internal/api/core/v1alpha3/conversion.go index 8cda6f6a25f1..3a7af0830444 100644 --- a/internal/api/core/v1alpha3/conversion.go +++ b/internal/api/core/v1alpha3/conversion.go @@ -98,6 +98,13 @@ func (src *Cluster) ConvertTo(dstRaw conversion.Hub) error { if err != nil { return err } + // Recover unhealthyMachineConditions for control plane and machine deployments + dst.Spec.Topology.ControlPlane.HealthCheck.Checks.UnhealthyMachineConditions = restored.Spec.Topology.ControlPlane.HealthCheck.Checks.UnhealthyMachineConditions + for i, md := range restored.Spec.Topology.Workers.MachineDeployments { + if i < len(dst.Spec.Topology.Workers.MachineDeployments) { + dst.Spec.Topology.Workers.MachineDeployments[i].HealthCheck.Checks.UnhealthyMachineConditions = md.HealthCheck.Checks.UnhealthyMachineConditions + } + } // Recover intent for bool values converted to *bool. 
clusterv1.Convert_bool_To_Pointer_bool(src.Spec.Paused, ok, restored.Spec.Paused, &dst.Spec.Paused) @@ -503,6 +510,8 @@ func (src *MachineHealthCheck) ConvertTo(dstRaw conversion.Hub) error { return err } + dst.Spec.Checks.UnhealthyMachineConditions = restored.Spec.Checks.UnhealthyMachineConditions + clusterv1.Convert_int32_To_Pointer_int32(src.Status.ExpectedMachines, ok, restored.Status.ExpectedMachines, &dst.Status.ExpectedMachines) clusterv1.Convert_int32_To_Pointer_int32(src.Status.CurrentHealthy, ok, restored.Status.CurrentHealthy, &dst.Status.CurrentHealthy) clusterv1.Convert_int32_To_Pointer_int32(src.Status.RemediationsAllowed, ok, restored.Status.RemediationsAllowed, &dst.Status.RemediationsAllowed) diff --git a/internal/api/core/v1alpha4/conversion.go b/internal/api/core/v1alpha4/conversion.go index 57ac08a4ea98..34177a9a6027 100644 --- a/internal/api/core/v1alpha4/conversion.go +++ b/internal/api/core/v1alpha4/conversion.go @@ -92,6 +92,11 @@ func (src *Cluster) ConvertTo(dstRaw conversion.Hub) error { return err } + dst.Spec.Topology.ControlPlane.HealthCheck.Checks.UnhealthyMachineConditions = restored.Spec.Topology.ControlPlane.HealthCheck.Checks.UnhealthyMachineConditions + for i, md := range restored.Spec.Topology.Workers.MachineDeployments { + dst.Spec.Topology.Workers.MachineDeployments[i].HealthCheck.Checks.UnhealthyMachineConditions = md.HealthCheck.Checks.UnhealthyMachineConditions + } + // Recover intent for bool values converted to *bool. clusterv1.Convert_bool_To_Pointer_bool(src.Spec.Paused, ok, restored.Spec.Paused, &dst.Spec.Paused) @@ -237,6 +242,11 @@ func (src *ClusterClass) ConvertTo(dstRaw conversion.Hub) error { return err } + dst.Spec.ControlPlane.HealthCheck.Checks.UnhealthyMachineConditions = restored.Spec.ControlPlane.HealthCheck.Checks.UnhealthyMachineConditions + for i, md := range restored.Spec.Workers.MachineDeployments { + dst.Spec.Workers.MachineDeployments[i].HealthCheck.Checks.UnhealthyMachineConditions = md.HealthCheck.Checks.UnhealthyMachineConditions + } + dst.Spec.Patches = restored.Spec.Patches dst.Spec.Variables = restored.Spec.Variables dst.Spec.AvailabilityGates = restored.Spec.AvailabilityGates @@ -593,6 +603,8 @@ func (src *MachineHealthCheck) ConvertTo(dstRaw conversion.Hub) error { return err } + dst.Spec.Checks.UnhealthyMachineConditions = restored.Spec.Checks.UnhealthyMachineConditions + clusterv1.Convert_int32_To_Pointer_int32(src.Status.ExpectedMachines, ok, restored.Status.ExpectedMachines, &dst.Status.ExpectedMachines) clusterv1.Convert_int32_To_Pointer_int32(src.Status.CurrentHealthy, ok, restored.Status.CurrentHealthy, &dst.Status.CurrentHealthy) clusterv1.Convert_int32_To_Pointer_int32(src.Status.RemediationsAllowed, ok, restored.Status.RemediationsAllowed, &dst.Status.RemediationsAllowed) diff --git a/internal/controllers/machinehealthcheck/machinehealthcheck_controller_test.go b/internal/controllers/machinehealthcheck/machinehealthcheck_controller_test.go index 47b39e1b742e..9eabc9819e90 100644 --- a/internal/controllers/machinehealthcheck/machinehealthcheck_controller_test.go +++ b/internal/controllers/machinehealthcheck/machinehealthcheck_controller_test.go @@ -1394,6 +1394,7 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { { Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, Status: metav1.ConditionFalse, + Reason: "EtcdPodUnhealthy", LastTransitionTime: metav1.NewTime(time.Now().Add(-10 * time.Minute)), }, } @@ -2880,7 +2881,7 @@ func newMachineHealthCheck(namespace, clusterName string) 
*clusterv1.MachineHeal }, UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ { - Type: clusterv1.MachineOwnerRemediatedCondition, + Type: clusterv1.MachineReadyCondition, Status: metav1.ConditionUnknown, TimeoutSeconds: ptr.To(int32(5 * 60)), }, diff --git a/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go b/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go index 2cd9742f39ae..07edf25be0f8 100644 --- a/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go +++ b/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go @@ -196,7 +196,7 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi timeoutSecondsDuration := time.Duration(ptr.Deref(c.TimeoutSeconds, 0)) * time.Second if nodeCondition.LastTransitionTime.Add(timeoutSecondsDuration).Before(now) { - v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, clusterv1.UnhealthyNodeConditionV1Beta1Reason, clusterv1.ConditionSeverityWarning, "Condition %s on node is reporting status %s for more than %s", c.Type, c.Status, timeoutSecondsDuration.String()) + v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, clusterv1.UnhealthyNodeConditionV1Beta1Reason, clusterv1.ConditionSeverityWarning, "Condition %s on Node is reporting status %s for more than %s", c.Type, c.Status, timeoutSecondsDuration.String()) logger.V(3).Info("Target is unhealthy: condition is in state longer than allowed timeout", "condition", c.Type, "state", c.Status, "timeout", timeoutSecondsDuration.String()) conditions.Set(t.Machine, metav1.Condition{ @@ -230,7 +230,7 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi timeoutSecondsDuration := time.Duration(ptr.Deref(c.TimeoutSeconds, 0)) * time.Second if machineCondition.LastTransitionTime.Add(timeoutSecondsDuration).Before(now) { - v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, clusterv1.UnhealthyMachineConditionV1Beta1Reason, clusterv1.ConditionSeverityWarning, "Condition %s on machine is reporting status %s for more than %s", c.Type, c.Status, timeoutSecondsDuration.String()) + v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, clusterv1.UnhealthyMachineConditionV1Beta1Reason, clusterv1.ConditionSeverityWarning, "Condition %s on Machine is reporting status %s for more than %s", c.Type, c.Status, timeoutSecondsDuration.String()) logger.V(3).Info("Target is unhealthy: condition is in state longer than allowed timeout", "condition", c.Type, "state", c.Status, "timeout", timeoutSecondsDuration.String()) conditions.Set(t.Machine, metav1.Condition{ diff --git a/internal/controllers/machinehealthcheck/machinehealthcheck_targets_test.go b/internal/controllers/machinehealthcheck/machinehealthcheck_targets_test.go index aae1a1f6fedd..daba75a0aa65 100644 --- a/internal/controllers/machinehealthcheck/machinehealthcheck_targets_test.go +++ b/internal/controllers/machinehealthcheck/machinehealthcheck_targets_test.go @@ -391,7 +391,7 @@ func TestHealthCheckTargets(t *testing.T) { nodeMissing: false, } nodeUnknown400Condition := newFailedHealthCheckV1Beta1Condition(clusterv1.UnhealthyNodeConditionV1Beta1Reason, "Condition Ready on node is reporting status Unknown for more than %s", (time.Duration(timeoutForUnhealthyNodeConditions) * time.Second).String()) - nodeUnknown400V1Beta2Condition := 
newFailedHealthCheckCondition(clusterv1.MachineHealthCheckUnhealthyNodeReason, "Health check failed: Condition Ready on Node is reporting status Unknown for more than %s", (time.Duration(timeoutForUnhealthyMachineConditions) * time.Second).String()) + nodeUnknown400V1Beta2Condition := newFailedHealthCheckCondition(clusterv1.MachineHealthCheckUnhealthyNodeReason, "Health check failed: Condition Ready on Node is reporting status Unknown for more than %s", (time.Duration(timeoutForUnhealthyNodeConditions) * time.Second).String()) // Target for when a node is healthy testNodeHealthy := newTestNode("node1") diff --git a/internal/webhooks/clusterclass_test.go b/internal/webhooks/clusterclass_test.go index ab709b0b5a21..24c70dad8f41 100644 --- a/internal/webhooks/clusterclass_test.go +++ b/internal/webhooks/clusterclass_test.go @@ -808,6 +808,7 @@ func TestClusterClassValidation(t *testing.T) { TimeoutSeconds: ptr.To(int32(5 * 60)), }, }, + // UnhealthyMachineConditions can target control plane Machine conditions such as EtcdPodHealthy. UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ { Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, diff --git a/test/e2e/data/infrastructure-docker/main/cluster-template-kcp-remediation/mhc.yaml b/test/e2e/data/infrastructure-docker/main/cluster-template-kcp-remediation/mhc.yaml index 74c4561765ac..3bd890e2f495 100644 --- a/test/e2e/data/infrastructure-docker/main/cluster-template-kcp-remediation/mhc.yaml +++ b/test/e2e/data/infrastructure-docker/main/cluster-template-kcp-remediation/mhc.yaml @@ -3,7 +3,6 @@ # - a selector that targets all the machines with label cluster.x-k8s.io/control-plane="" and the mhc-test: "fail" (the label is used to trigger remediation in a controlled way - by adding CP under MHC control intentionally -) # - nodeStartupTimeoutSeconds: 30s (to force remediation on nodes still provisioning) # - unhealthyNodeConditions triggering remediation after 10s the e2e.remediation.condition condition is set to false (to force remediation on nodes already provisioned) -# - unhealthyMachineConditions triggering remediation after 10s the condition is set to false (to force remediation on machines already provisioned) apiVersion: cluster.x-k8s.io/v1beta2 kind: MachineHealthCheck metadata: @@ -20,10 +19,6 @@ spec: - type: e2e.remediation.condition status: "False" timeoutSeconds: 10 - unhealthyMachineConditions: - - type: "Ready" - status: "False" - timeoutSeconds: 10 remediation: triggerIf: unhealthyLessThanOrEqualTo: 100% diff --git a/test/e2e/data/infrastructure-docker/main/cluster-template-md-remediation/mhc.yaml b/test/e2e/data/infrastructure-docker/main/cluster-template-md-remediation/mhc.yaml index f5c9736b06a2..1c65f920dc68 100644 --- a/test/e2e/data/infrastructure-docker/main/cluster-template-md-remediation/mhc.yaml +++ b/test/e2e/data/infrastructure-docker/main/cluster-template-md-remediation/mhc.yaml @@ -1,7 +1,7 @@ --- # MachineHealthCheck object with # - a selector that targets all the machines with label e2e.remediation.label="" -# - unhealthyNodeConditions and unhealthyMachineConditions triggering remediation after 10s the condition is set +# - unhealthyNodeConditions triggering remediation after 10s the condition is set apiVersion: cluster.x-k8s.io/v1beta2 kind: MachineHealthCheck metadata: @@ -16,10 +16,6 @@ spec: - type: e2e.remediation.condition status: "False" timeoutSeconds: 10 - unhealthyMachineConditions: - - type: "Ready" - status: "False" - timeoutSeconds: 10 remediation: triggerIf: unhealthyLessThanOrEqualTo: 100% diff --git
a/test/e2e/data/infrastructure-docker/main/clusterclass-quick-start.yaml b/test/e2e/data/infrastructure-docker/main/clusterclass-quick-start.yaml index 70bf9231d2b2..6a4d8f9b5e26 100644 --- a/test/e2e/data/infrastructure-docker/main/clusterclass-quick-start.yaml +++ b/test/e2e/data/infrastructure-docker/main/clusterclass-quick-start.yaml @@ -25,9 +25,9 @@ spec: status: "False" timeoutSeconds: 20 unhealthyMachineConditions: - - type: "Ready" + - type: e2e.clusterclass.condition status: "False" - timeoutSeconds: 20 + timeoutSeconds: 300 infrastructure: templateRef: apiVersion: infrastructure.cluster.x-k8s.io/v1beta2 diff --git a/test/e2e/data/infrastructure-docker/v1.11/clusterclass-quick-start.yaml b/test/e2e/data/infrastructure-docker/v1.11/clusterclass-quick-start.yaml index e747a5918951..48b41b1508d2 100644 --- a/test/e2e/data/infrastructure-docker/v1.11/clusterclass-quick-start.yaml +++ b/test/e2e/data/infrastructure-docker/v1.11/clusterclass-quick-start.yaml @@ -24,10 +24,6 @@ spec: - type: e2e.remediation.condition status: "False" timeoutSeconds: 20 - unhealthyMachineConditions: - - type: "Ready" - status: "False" - timeoutSeconds: 20 infrastructure: templateRef: apiVersion: infrastructure.cluster.x-k8s.io/v1beta2 @@ -51,7 +47,7 @@ spec: apiVersion: infrastructure.cluster.x-k8s.io/v1beta2 kind: DockerMachineTemplate name: quick-start-default-worker-machinetemplate - # We are intentionally not setting the 'unhealthyNodeConditions and unhealthyMachineConditions' here to test that those fields are optional. + # We are intentionally not setting the 'unhealthyNodeConditions' here to test that the field is optional. machinePools: - class: default-worker metadata: diff --git a/test/e2e/kcp_remediations.go b/test/e2e/kcp_remediations.go index 0edccd668bad..fa80f9cbd871 100644 --- a/test/e2e/kcp_remediations.go +++ b/test/e2e/kcp_remediations.go @@ -79,10 +79,6 @@ type KCPRemediationSpecInput struct { // - type: e2e.remediation.condition // status: "False" // timeout: 10s - // unhealthyMachineConditions: - // - type: "Ready" - // status: "False" - // timeoutSeconds: 10 // If not specified, "kcp-remediation" is used. Flavor *string diff --git a/test/framework/machine_helpers.go b/test/framework/machine_helpers.go index f3018594db9d..75d3364417ab 100644 --- a/test/framework/machine_helpers.go +++ b/test/framework/machine_helpers.go @@ -22,7 +22,6 @@ import ( . "github.com/onsi/gomega" "github.com/pkg/errors" corev1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/klog/v2" "sigs.k8s.io/controller-runtime/pkg/client" @@ -252,48 +251,6 @@ func PatchNodeCondition(ctx context.Context, input PatchNodeConditionInput) { }, retryableOperationTimeout, retryableOperationInterval).Should(Succeed(), "Failed to patch node %s", input.Machine.Status.NodeRef.Name) } -type PatchMachineConditionInput struct { - ClusterProxy ClusterProxy - Cluster *clusterv1.Cluster - Machine *clusterv1.Machine - MachineCondition clusterv1.Condition -} - -// PatchMachineCondition patches a Machine resource's status with a given condition -// to simulate an unhealthy machine for MHC testing. -func PatchMachineCondition(ctx context.Context, input PatchMachineConditionInput) { - Expect(ctx).NotTo(BeNil(), "ctx is required for PatchMachineCondition") - Expect(input.ClusterProxy).ToNot(BeNil(), "Invalid argument. input.ClusterProxy can't be nil when calling PatchMachineCondition") - Expect(input.Cluster).ToNot(BeNil(), "Invalid argument. 
input.Cluster can't be nil when calling PatchMachineCondition") - Expect(input.Machine).ToNot(BeNil(), "Invalid argument. input.Machine can't be nil when calling PatchMachineCondition") - - machine := &clusterv1.Machine{} - Eventually(func() error { - return input.ClusterProxy.GetClient().Get(ctx, types.NamespacedName{ - Namespace: input.Machine.Namespace, - Name: input.Machine.Name, - }, machine) - }, retryableOperationTimeout, retryableOperationInterval).Should(Succeed(), "Failed to get Machine %s", input.Machine.Name) - - patchHelper, err := patch.NewHelper(machine, input.ClusterProxy.GetClient()) - Expect(err).ToNot(HaveOccurred()) - - // Convert clusterv1.Condition -> metav1.Condition - cond := metav1.Condition{ - Type: string(input.MachineCondition.Type), - Status: metav1.ConditionStatus(input.MachineCondition.Status), - LastTransitionTime: input.MachineCondition.LastTransitionTime, - Reason: input.MachineCondition.Reason, - Message: input.MachineCondition.Message, - } - - machine.Status.Conditions = append(machine.Status.Conditions, cond) - - Eventually(func() error { - return patchHelper.Patch(ctx, machine) - }, retryableOperationTimeout, retryableOperationInterval).Should(Succeed(), "Failed to patch Machine %s", input.Machine.Name) -} - // MachineStatusCheck is a type that operates a status check on a Machine. type MachineStatusCheck func(p *clusterv1.Machine) error diff --git a/test/framework/machinehealthcheck_helpers.go b/test/framework/machinehealthcheck_helpers.go index 35622708f9db..1a147d4afcc6 100644 --- a/test/framework/machinehealthcheck_helpers.go +++ b/test/framework/machinehealthcheck_helpers.go @@ -57,7 +57,6 @@ func DiscoverMachineHealthChecksAndWaitForRemediation(ctx context.Context, input for _, mhc := range machineHealthChecks { Expect(mhc.Spec.Checks.UnhealthyNodeConditions).NotTo(BeEmpty()) - Expect(mhc.Spec.Checks.UnhealthyMachineConditions).NotTo(BeEmpty()) fmt.Fprintln(GinkgoWriter, "Ensuring there is at least 1 Machine that MachineHealthCheck is matching") machines := GetMachinesByMachineHealthCheck(ctx, GetMachinesByMachineHealthCheckInput{ @@ -81,21 +80,6 @@ func DiscoverMachineHealthChecksAndWaitForRemediation(ctx context.Context, input Machine: machines[0], }) - fmt.Fprintln(GinkgoWriter, "Patching MachineHealthCheck unhealthy machine condition to one of the machines") - unhealthyMachineCondition := clusterv1.Condition{ - Type: clusterv1.ConditionType(mhc.Spec.Checks.UnhealthyMachineConditions[0].Type), - Status: corev1.ConditionStatus(mhc.Spec.Checks.UnhealthyMachineConditions[0].Status), - - LastTransitionTime: metav1.Time{Time: time.Now()}, - } - - PatchMachineCondition(ctx, PatchMachineConditionInput{ - ClusterProxy: input.ClusterProxy, - Cluster: input.Cluster, - Machine: machines[0].DeepCopy(), - MachineCondition: unhealthyMachineCondition, - }) - fmt.Fprintln(GinkgoWriter, "Waiting for remediation") WaitForMachineHealthCheckToRemediateUnhealthyNodeCondition(ctx, WaitForMachineHealthCheckToRemediateUnhealthyNodeConditionInput{ ClusterProxy: input.ClusterProxy, @@ -103,14 +87,6 @@ func DiscoverMachineHealthChecksAndWaitForRemediation(ctx context.Context, input MachineHealthCheck: mhc, MachinesCount: len(machines), }, input.WaitForMachineRemediation...) 
- - fmt.Fprintln(GinkgoWriter, "Waiting for remediation of unhealthy machine condition") - WaitForMachineHealthCheckToRemediateUnhealthyMachineCondition(ctx, WaitForMachineHealthCheckToRemediateUnhealthyNodeConditionInput{ - ClusterProxy: input.ClusterProxy, - Cluster: input.Cluster, - MachineHealthCheck: mhc, - MachinesCount: len(machines), - }, input.WaitForMachineRemediation...) } } @@ -192,35 +168,6 @@ func WaitForMachineHealthCheckToRemediateUnhealthyNodeCondition(ctx context.Cont }, intervals...).Should(BeTrue()) } -// WaitForMachineHealthCheckToRemediateUnhealthyMachineCondition patches a machine condition to simulate unhealthiness and waits for remediation. -func WaitForMachineHealthCheckToRemediateUnhealthyMachineCondition(ctx context.Context, input WaitForMachineHealthCheckToRemediateUnhealthyNodeConditionInput, intervals ...interface{}) { - Expect(ctx).NotTo(BeNil(), "ctx is required for WaitForMachineHealthCheckToRemediateUnhealthyMachineCondition") - Expect(input.ClusterProxy).ToNot(BeNil(), "Invalid argument. input.ClusterProxy can't be nil when calling WaitForMachineHealthCheckToRemediateUnhealthyMachineCondition") - Expect(input.Cluster).ToNot(BeNil(), "Invalid argument. input.Cluster can't be nil when calling WaitForMachineHealthCheckToRemediateUnhealthyMachineCondition") - Expect(input.MachineHealthCheck).NotTo(BeNil(), "Invalid argument. input.MachineHealthCheck can't be nil when calling WaitForMachineHealthCheckToRemediateUnhealthyMachineCondition") - Expect(input.MachinesCount).NotTo(BeZero(), "Invalid argument. input.MachinesCount can't be zero when calling WaitForMachineHealthCheckToRemediateUnhealthyMachineCondition") - - fmt.Fprintln(GinkgoWriter, "Waiting until the machine with unhealthy machine condition is remediated") - - Eventually(func() bool { - machines := GetMachinesByMachineHealthCheck(ctx, GetMachinesByMachineHealthCheckInput{ - Lister: input.ClusterProxy.GetClient(), - ClusterName: input.Cluster.Name, - MachineHealthCheck: input.MachineHealthCheck, - }) - if len(machines) < input.MachinesCount { - return false - } - - for _, machine := range machines { - if hasMatchingUnhealthyMachineConditions(input.MachineHealthCheck, machine.Status.Conditions) { - return false - } - } - return true - }, intervals...).Should(BeTrue()) -} - // hasMatchingUnhealthyNodeConditions returns true if any node condition matches with machine health check unhealthy conditions. 
func hasMatchingUnhealthyNodeConditions(machineHealthCheck *clusterv1.MachineHealthCheck, nodeConditions []corev1.NodeCondition) bool { for _, unhealthyNodeCondition := range machineHealthCheck.Spec.Checks.UnhealthyNodeConditions { @@ -232,15 +179,3 @@ func hasMatchingUnhealthyNodeConditions(machineHealthCheck *clusterv1.MachineHea } return false } - -func hasMatchingUnhealthyMachineConditions(machineHealthCheck *clusterv1.MachineHealthCheck, machineConditions []metav1.Condition) bool { - for _, unhealthyMachineCondition := range machineHealthCheck.Spec.Checks.UnhealthyMachineConditions { - for _, machineCondition := range machineConditions { - if machineCondition.Type == unhealthyMachineCondition.Type && - machineCondition.Status == unhealthyMachineCondition.Status { - return true - } - } - } - return false -} From f96b74299588d4eba1791d9427f33feb599fad47 Mon Sep 17 00:00:00 2001 From: Furkat Gofurov Date: Mon, 13 Oct 2025 22:55:31 +0300 Subject: [PATCH 4/8] Address review comments: rework node and machine checks in needsRemediation() method If both a node condition and machine condition are unhealthy, pick one reason but combine all the messages Signed-off-by: Furkat Gofurov --- .../machinehealthcheck_targets.go | 88 ++++++++++++++----- .../machinehealthcheck_targets_test.go | 4 +- internal/webhooks/machinehealthcheck_test.go | 2 +- 3 files changed, 69 insertions(+), 25 deletions(-) diff --git a/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go b/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go index 07edf25be0f8..d26a5be5eca6 100644 --- a/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go +++ b/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go @@ -19,6 +19,7 @@ package machinehealthcheck import ( "context" "fmt" + "strings" "time" "github.com/go-logr/logr" @@ -181,6 +182,13 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi return false, nextCheck } + // Collect all unhealthy conditions (both node and machine) to provide comprehensive status + var ( + unhealthyMessages []string + unhealthyReasons []string + foundUnhealthyCondition bool + ) + // check node conditions for _, c := range t.MHC.Spec.Checks.UnhealthyNodeConditions { nodeCondition := getNodeCondition(t.Node, c.Type) @@ -192,20 +200,15 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi } // If the node condition has been in the unhealthy state for longer than the - // timeout, return true with no requeue time. + // timeout, mark as unhealthy and collect the message. 
timeoutSecondsDuration := time.Duration(ptr.Deref(c.TimeoutSeconds, 0)) * time.Second if nodeCondition.LastTransitionTime.Add(timeoutSecondsDuration).Before(now) { - v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, clusterv1.UnhealthyNodeConditionV1Beta1Reason, clusterv1.ConditionSeverityWarning, "Condition %s on Node is reporting status %s for more than %s", c.Type, c.Status, timeoutSecondsDuration.String()) - logger.V(3).Info("Target is unhealthy: condition is in state longer than allowed timeout", "condition", c.Type, "state", c.Status, "timeout", timeoutSecondsDuration.String()) - - conditions.Set(t.Machine, metav1.Condition{ - Type: clusterv1.MachineHealthCheckSucceededCondition, - Status: metav1.ConditionFalse, - Reason: clusterv1.MachineHealthCheckUnhealthyNodeReason, - Message: fmt.Sprintf("Health check failed: Condition %s on Node is reporting status %s for more than %s", c.Type, c.Status, timeoutSecondsDuration.String()), - }) - return true, time.Duration(0) + foundUnhealthyCondition = true + unhealthyMessages = append(unhealthyMessages, fmt.Sprintf("Node condition %s is %s for more than %s", c.Type, c.Status, timeoutSecondsDuration.String())) + unhealthyReasons = append(unhealthyReasons, "UnhealthyNode") + logger.V(3).Info("Target is unhealthy: node condition is in state longer than allowed timeout", "condition", c.Type, "state", c.Status, "timeout", timeoutSecondsDuration.String()) + continue } durationUnhealthy := now.Sub(nodeCondition.LastTransitionTime.Time) @@ -226,20 +229,15 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi } // If the machine condition has been in the unhealthy state for longer than the - // timeout, return true with no requeue time. + // timeout, mark as unhealthy and collect the message. 
timeoutSecondsDuration := time.Duration(ptr.Deref(c.TimeoutSeconds, 0)) * time.Second if machineCondition.LastTransitionTime.Add(timeoutSecondsDuration).Before(now) { - v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, clusterv1.UnhealthyMachineConditionV1Beta1Reason, clusterv1.ConditionSeverityWarning, "Condition %s on Machine is reporting status %s for more than %s", c.Type, c.Status, timeoutSecondsDuration.String()) - logger.V(3).Info("Target is unhealthy: condition is in state longer than allowed timeout", "condition", c.Type, "state", c.Status, "timeout", timeoutSecondsDuration.String()) - - conditions.Set(t.Machine, metav1.Condition{ - Type: clusterv1.MachineHealthCheckSucceededCondition, - Status: metav1.ConditionFalse, - Reason: clusterv1.MachineHealthCheckUnhealthyMachineReason, - Message: fmt.Sprintf("Health check failed: Condition %s on Machine is reporting status %s for more than %s", c.Type, c.Status, timeoutSecondsDuration.String()), - }) - return true, time.Duration(0) + foundUnhealthyCondition = true + unhealthyMessages = append(unhealthyMessages, fmt.Sprintf("Machine condition %s is %s for more than %s", c.Type, c.Status, timeoutSecondsDuration.String())) + unhealthyReasons = append(unhealthyReasons, "UnhealthyMachine") + logger.V(3).Info("Target is unhealthy: machine condition is in state longer than allowed timeout", "condition", c.Type, "state", c.Status, "timeout", timeoutSecondsDuration.String()) + continue } durationUnhealthy := now.Sub(machineCondition.LastTransitionTime.Time) @@ -249,6 +247,52 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi } } + // If any unhealthy conditions were found, set the combined status + if foundUnhealthyCondition { + // Determine the primary reason based on a consistent priority order: + // 1. If both node and machine conditions are present, use a combined reason + // 2. 
Otherwise use the specific reason for the type that failed + var primaryReason, v1beta1Reason string + if len(unhealthyReasons) > 0 { + // Check if we have both node and machine reasons + hasNodeReason := false + hasMachineReason := false + for _, reason := range unhealthyReasons { + switch reason { + case "UnhealthyNode": + hasNodeReason = true + case "UnhealthyMachine": + hasMachineReason = true + } + } + + if hasNodeReason && hasMachineReason { + // Both types of conditions are unhealthy - use machine reason but indicate it's combined + primaryReason = clusterv1.MachineHealthCheckUnhealthyMachineReason + v1beta1Reason = clusterv1.UnhealthyMachineConditionV1Beta1Reason + } else if hasMachineReason { + primaryReason = clusterv1.MachineHealthCheckUnhealthyMachineReason + v1beta1Reason = clusterv1.UnhealthyMachineConditionV1Beta1Reason + } else if hasNodeReason { + primaryReason = clusterv1.MachineHealthCheckUnhealthyNodeReason + v1beta1Reason = clusterv1.UnhealthyNodeConditionV1Beta1Reason + } + } + + // Combine all messages into a single comprehensive message + combinedMessage := fmt.Sprintf("Health check failed: %s", strings.Join(unhealthyMessages, "; ")) + + v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, v1beta1Reason, clusterv1.ConditionSeverityWarning, "%s", combinedMessage) + + conditions.Set(t.Machine, metav1.Condition{ + Type: clusterv1.MachineHealthCheckSucceededCondition, + Status: metav1.ConditionFalse, + Reason: primaryReason, + Message: combinedMessage, + }) + return true, time.Duration(0) + } + return false, minDuration(nextCheckTimes) } diff --git a/internal/controllers/machinehealthcheck/machinehealthcheck_targets_test.go b/internal/controllers/machinehealthcheck/machinehealthcheck_targets_test.go index daba75a0aa65..16a4029dcaa0 100644 --- a/internal/controllers/machinehealthcheck/machinehealthcheck_targets_test.go +++ b/internal/controllers/machinehealthcheck/machinehealthcheck_targets_test.go @@ -390,7 +390,7 @@ func TestHealthCheckTargets(t *testing.T) { Node: testNodeUnknown400, nodeMissing: false, } - nodeUnknown400Condition := newFailedHealthCheckV1Beta1Condition(clusterv1.UnhealthyNodeConditionV1Beta1Reason, "Condition Ready on node is reporting status Unknown for more than %s", (time.Duration(timeoutForUnhealthyNodeConditions) * time.Second).String()) + nodeUnknown400Condition := newFailedHealthCheckV1Beta1Condition(clusterv1.UnhealthyNodeConditionV1Beta1Reason, "Condition Ready on Node is reporting status Unknown for more than %s", (time.Duration(timeoutForUnhealthyNodeConditions) * time.Second).String()) nodeUnknown400V1Beta2Condition := newFailedHealthCheckCondition(clusterv1.MachineHealthCheckUnhealthyNodeReason, "Health check failed: Condition Ready on Node is reporting status Unknown for more than %s", (time.Duration(timeoutForUnhealthyNodeConditions) * time.Second).String()) // Target for when a node is healthy @@ -425,7 +425,7 @@ func TestHealthCheckTargets(t *testing.T) { } machineUnhealthy400Condition := newFailedHealthCheckV1Beta1Condition( clusterv1.UnhealthyMachineConditionV1Beta1Reason, - "Condition EtcdPodHealthy on machine is reporting status False for more than %s", + "Condition EtcdPodHealthy on Machine is reporting status False for more than %s", (time.Duration(timeoutForUnhealthyMachineConditions) * time.Second).String(), ) machineUnhealthy400V1Beta2Condition := newFailedHealthCheckCondition( diff --git a/internal/webhooks/machinehealthcheck_test.go b/internal/webhooks/machinehealthcheck_test.go 
index 35d7d598a738..5101d69820f6 100644 --- a/internal/webhooks/machinehealthcheck_test.go +++ b/internal/webhooks/machinehealthcheck_test.go @@ -301,7 +301,7 @@ func TestMachineHealthCheckUnhealthyMachineConditions(t *testing.T) { expectErr: false, }, { - name: "do not fail if the UnhealthyMachineCondition array is nil", + name: "do not fail if the UnhealthyMachineCondition array is empty", unhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{}, expectErr: false, }, From d80791188128e3165922fb409f6db116005d118c Mon Sep 17 00:00:00 2001 From: Furkat Gofurov Date: Fri, 17 Oct 2025 08:11:29 +0300 Subject: [PATCH 5/8] Address Stefan's comments (conversion) Signed-off-by: Furkat Gofurov --- internal/api/core/v1alpha3/conversion.go | 7 ------- internal/api/core/v1alpha3/conversion_test.go | 10 ---------- internal/api/core/v1alpha4/conversion.go | 10 ---------- internal/api/core/v1alpha4/conversion_test.go | 10 ---------- 4 files changed, 37 deletions(-) diff --git a/internal/api/core/v1alpha3/conversion.go b/internal/api/core/v1alpha3/conversion.go index 3a7af0830444..5d6c73735258 100644 --- a/internal/api/core/v1alpha3/conversion.go +++ b/internal/api/core/v1alpha3/conversion.go @@ -98,13 +98,6 @@ func (src *Cluster) ConvertTo(dstRaw conversion.Hub) error { if err != nil { return err } - // Recover unhealthyMachineConditions for control plane and machine deployments - dst.Spec.Topology.ControlPlane.HealthCheck.Checks.UnhealthyMachineConditions = restored.Spec.Topology.ControlPlane.HealthCheck.Checks.UnhealthyMachineConditions - for i, md := range restored.Spec.Topology.Workers.MachineDeployments { - if i < len(dst.Spec.Topology.Workers.MachineDeployments) { - dst.Spec.Topology.Workers.MachineDeployments[i].HealthCheck.Checks.UnhealthyMachineConditions = md.HealthCheck.Checks.UnhealthyMachineConditions - } - } // Recover intent for bool values converted to *bool. clusterv1.Convert_bool_To_Pointer_bool(src.Spec.Paused, ok, restored.Spec.Paused, &dst.Spec.Paused) diff --git a/internal/api/core/v1alpha3/conversion_test.go b/internal/api/core/v1alpha3/conversion_test.go index 4bac92df62b5..49ee38de30bb 100644 --- a/internal/api/core/v1alpha3/conversion_test.go +++ b/internal/api/core/v1alpha3/conversion_test.go @@ -417,7 +417,6 @@ func spokeCluster(in *Cluster, c randfill.Continue) { func MachineHealthCheckFuzzFunc(_ runtimeserializer.CodecFactory) []interface{} { return []interface{}{ hubMachineHealthCheckStatus, - hubMachineHealthCheckSpec, hubUnhealthyNodeCondition, spokeMachineHealthCheck, spokeMachineHealthCheckSpec, @@ -426,15 +425,6 @@ func MachineHealthCheckFuzzFunc(_ runtimeserializer.CodecFactory) []interface{} } } -func hubMachineHealthCheckSpec(in *clusterv1.MachineHealthCheckSpec, c randfill.Continue) { - c.FillNoCustom(in) - - // Drop UnhealthyMachineConditions as it does not exist in v1alpha3. - if in.Checks.UnhealthyMachineConditions != nil { - in.Checks.UnhealthyMachineConditions = nil - } -} - func hubMachineHealthCheckStatus(in *clusterv1.MachineHealthCheckStatus, c randfill.Continue) { c.FillNoCustom(in) // Drop empty structs with only omit empty fields.
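[Editorial note] For readers tracing where the fuzz funcs removed in this commit were consumed: they are registered with the round-trip fuzz test sketched below. This is a minimal sketch, not part of the patch; the FuzzTestFunc/FuzzTestFuncInput helper names are assumed to match upstream Cluster API's util/conversion package, and the fuzzer package is assumed to be k8s.io/apimachinery's apitesting fuzzer.

// Sketch only: the round-trip fuzz test shape that MachineHealthCheckFuzzFunc plugs into.
// With hubMachineHealthCheckSpec gone, the fuzzer is free to set
// UnhealthyMachineConditions on hub objects, so the round trip now exercises
// the annotation-based restore path for that field directly.
func TestFuzzyConversion(t *testing.T) {
	t.Run("for MachineHealthCheck", utilconversion.FuzzTestFunc(utilconversion.FuzzTestFuncInput{
		Hub:         &clusterv1.MachineHealthCheck{},
		Spoke:       &MachineHealthCheck{},
		FuzzerFuncs: []fuzzer.FuzzerFuncs{MachineHealthCheckFuzzFunc},
	}))
}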
diff --git a/internal/api/core/v1alpha4/conversion.go b/internal/api/core/v1alpha4/conversion.go index 34177a9a6027..cc921319b13b 100644 --- a/internal/api/core/v1alpha4/conversion.go +++ b/internal/api/core/v1alpha4/conversion.go @@ -92,11 +92,6 @@ func (src *Cluster) ConvertTo(dstRaw conversion.Hub) error { return err } - dst.Spec.Topology.ControlPlane.HealthCheck.Checks.UnhealthyMachineConditions = restored.Spec.Topology.ControlPlane.HealthCheck.Checks.UnhealthyMachineConditions - for i, md := range restored.Spec.Topology.Workers.MachineDeployments { - dst.Spec.Topology.Workers.MachineDeployments[i].HealthCheck.Checks.UnhealthyMachineConditions = md.HealthCheck.Checks.UnhealthyMachineConditions - } - // Recover intent for bool values converted to *bool. clusterv1.Convert_bool_To_Pointer_bool(src.Spec.Paused, ok, restored.Spec.Paused, &dst.Spec.Paused) @@ -242,11 +237,6 @@ func (src *ClusterClass) ConvertTo(dstRaw conversion.Hub) error { return err } - dst.Spec.ControlPlane.HealthCheck.Checks.UnhealthyMachineConditions = restored.Spec.ControlPlane.HealthCheck.Checks.UnhealthyMachineConditions - for i, md := range restored.Spec.Workers.MachineDeployments { - dst.Spec.Workers.MachineDeployments[i].HealthCheck.Checks.UnhealthyMachineConditions = md.HealthCheck.Checks.UnhealthyMachineConditions - } - dst.Spec.Patches = restored.Spec.Patches dst.Spec.Variables = restored.Spec.Variables dst.Spec.AvailabilityGates = restored.Spec.AvailabilityGates diff --git a/internal/api/core/v1alpha4/conversion_test.go b/internal/api/core/v1alpha4/conversion_test.go index 8d6ca8f84b6b..126009a1d891 100644 --- a/internal/api/core/v1alpha4/conversion_test.go +++ b/internal/api/core/v1alpha4/conversion_test.go @@ -486,7 +486,6 @@ func spokeMachineDeploymentSpec(in *MachineDeploymentSpec, c randfill.Continue) func MachineHealthCheckFuzzFunc(_ runtimeserializer.CodecFactory) []interface{} { return []interface{}{ hubMachineHealthCheckStatus, - hubMachineHealthCheckSpec, hubUnhealthyNodeCondition, spokeMachineHealthCheck, spokeMachineHealthCheckSpec, @@ -495,15 +494,6 @@ func MachineHealthCheckFuzzFunc(_ runtimeserializer.CodecFactory) []interface{} } } -func hubMachineHealthCheckSpec(in *clusterv1.MachineHealthCheckSpec, c randfill.Continue) { - c.FillNoCustom(in) - - // Drop UnhealthyMachineConditions as it does not exist in v1alpha4. - if in.Checks.UnhealthyMachineConditions != nil { - in.Checks.UnhealthyMachineConditions = nil - } -} - func hubMachineHealthCheckStatus(in *clusterv1.MachineHealthCheckStatus, c randfill.Continue) { c.FillNoCustom(in) // Drop empty structs with only omit empty fields. 
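[Editorial note] The cleanups in this commit rely on the conversion-data annotation to keep spoke round trips lossless. Below is a simplified sketch of that mechanism; the utilconversion.MarshalData/UnmarshalData helpers and the generated Convert_* function names are assumptions based on upstream Cluster API conventions, not part of the patch.

// Sketch only: how a spoke API version (here v1alpha4) preserves hub-only
// fields such as spec.checks.unhealthyMachineConditions across a
// hub -> spoke -> hub round trip.
func (dst *MachineHealthCheck) ConvertFrom(srcRaw conversion.Hub) error {
	src := srcRaw.(*clusterv1.MachineHealthCheck)
	if err := Convert_v1beta2_MachineHealthCheck_To_v1alpha4_MachineHealthCheck(src, dst, nil); err != nil {
		return err
	}
	// Stash the full hub object in an annotation so ConvertTo can recover
	// fields the spoke type cannot represent.
	return utilconversion.MarshalData(src, dst)
}

func (src *MachineHealthCheck) ConvertTo(dstRaw conversion.Hub) error {
	dst := dstRaw.(*clusterv1.MachineHealthCheck)
	if err := Convert_v1alpha4_MachineHealthCheck_To_v1beta2_MachineHealthCheck(src, dst, nil); err != nil {
		return err
	}
	restored := &clusterv1.MachineHealthCheck{}
	ok, err := utilconversion.UnmarshalData(src, restored)
	if err != nil {
		return err
	}
	if ok {
		// Copy back hub-only fields from the stashed hub object.
		dst.Spec.Checks.UnhealthyMachineConditions = restored.Spec.Checks.UnhealthyMachineConditions
	}
	return nil
}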
From a8266e7ac5f6a6bbadcee7222dc861d5b4891050 Mon Sep 17 00:00:00 2001 From: Furkat Gofurov Date: Fri, 17 Oct 2025 08:41:56 +0300 Subject: [PATCH 6/8] Address Fabrizio's review comments (mhc target, mhc controller code) Refactors `needsRemediation`; specifically, the following changes were made: - Move machine condition evaluation to always execute first, regardless of node state - Ensure machine conditions are checked in ALL scenarios: * When node is missing (t.nodeMissing) * When node hasn't appeared yet (t.Node == nil) * When node exists (t.Node != nil) - Consistently merge node and machine condition messages in all failure scenarios - Maintain backward compatibility with existing condition message formats - Use appropriate condition reasons based on which conditions are unhealthy Signed-off-by: Furkat Gofurov --- .../machinehealthcheck_controller_test.go | 16 +- .../machinehealthcheck_targets.go | 247 ++++++++++-------- 2 files changed, 153 insertions(+), 110 deletions(-) diff --git a/internal/controllers/machinehealthcheck/machinehealthcheck_controller_test.go b/internal/controllers/machinehealthcheck/machinehealthcheck_controller_test.go index 9eabc9819e90..6ae33a9442fa 100644 --- a/internal/controllers/machinehealthcheck/machinehealthcheck_controller_test.go +++ b/internal/controllers/machinehealthcheck/machinehealthcheck_controller_test.go @@ -1332,6 +1332,15 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { mhc := newMachineHealthCheck(cluster.Namespace, cluster.Name) + // Add UnhealthyMachineConditions to test machine condition evaluation + mhc.Spec.Checks.UnhealthyMachineConditions = []clusterv1.UnhealthyMachineCondition{ + { + Type: controlplanev1.KubeadmControlPlaneMachineEtcdPodHealthyCondition, + Status: metav1.ConditionFalse, + TimeoutSeconds: ptr.To(int32(5 * 60)), + }, + } + g.Expect(env.Create(ctx, mhc)).To(Succeed()) defer func(do ...client.Object) { g.Expect(env.Cleanup(ctx, do...)).To(Succeed()) @@ -2879,13 +2888,6 @@ func newMachineHealthCheck(namespace, clusterName string) *clusterv1.MachineHeal TimeoutSeconds: ptr.To(int32(5 * 60)), }, }, - UnhealthyMachineConditions: []clusterv1.UnhealthyMachineCondition{ - { - Type: clusterv1.MachineReadyCondition, - Status: metav1.ConditionUnknown, - TimeoutSeconds: ptr.To(int32(5 * 60)), - }, - }, }, Remediation: clusterv1.MachineHealthCheckRemediation{ TriggerIf: clusterv1.MachineHealthCheckRemediationTriggerIf{ diff --git a/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go b/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go index d26a5be5eca6..61a252c4dd9b 100644 --- a/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go +++ b/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go @@ -74,16 +74,20 @@ func (t *healthCheckTarget) nodeName() string { return "" } -// Determine whether or not a given target needs remediation. -// The node will need remediation if any of the following are true: +// needsRemediation determines whether a given target needs remediation.
+// The machine will need remediation if any of the following are true: // - The Machine has the remediate machine annotation -// - The Machine has failed for some reason +// - Any condition on the machine matches the configured checks and exceeds the timeout // - The Machine did not get a node before `timeoutForMachineToHaveNode` elapses -// - The Node has gone away -// - Any condition on the node is matched for the given timeout -// If the target doesn't currently need rememdiation, provide a duration after -// which the target should next be checked. -// The target should be requeued after this duration. +// - The Node has been deleted but the Machine still references it +// - Any condition on the node matches the configured checks and exceeds the timeout +// +// Machine conditions are always evaluated first and consistently across all scenarios +// (node missing, node startup timeout, node exists) to ensure comprehensive health checking. +// When multiple issues are detected, error messages are merged to provide complete visibility. +// +// Returns true if remediation is needed, and a duration indicating when to recheck if remediation +// is not immediately required. The target should be requeued after this duration. func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachineToHaveNode metav1.Duration) (bool, time.Duration) { var nextCheckTimes []time.Duration now := time.Now() @@ -101,20 +105,6 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi return true, time.Duration(0) } - // Machine has Status.NodeRef set, although we couldn't find the node in the workload cluster. - if t.nodeMissing { - logger.V(3).Info("Target is unhealthy: node is missing") - v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, clusterv1.NodeNotFoundV1Beta1Reason, clusterv1.ConditionSeverityWarning, "") - - conditions.Set(t.Machine, metav1.Condition{ - Type: clusterv1.MachineHealthCheckSucceededCondition, - Status: metav1.ConditionFalse, - Reason: clusterv1.MachineHealthCheckNodeDeletedReason, - Message: fmt.Sprintf("Health check failed: Node %s has been deleted", t.Machine.Status.NodeRef.Name), - }) - return true, time.Duration(0) - } - // Don't penalize any Machine/Node if the control plane has not been initialized // Exception of this rule are control plane machine itself, so the first control plane machine can be remediated. if !conditions.IsTrue(t.Cluster, clusterv1.ClusterControlPlaneInitializedCondition) && !util.IsControlPlaneMachine(t.Machine) { @@ -130,12 +120,99 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi return false, 0 } + // Collect all unhealthy conditions (both node and machine) to provide comprehensive status + // Always evaluate machine conditions regardless of node state for consistent behavior + var ( + unhealthyNodeMessages []string + unhealthyMachineMessages []string + ) + + // Always check machine conditions first, regardless of node state + for _, c := range t.MHC.Spec.Checks.UnhealthyMachineConditions { + machineCondition := getMachineCondition(t.Machine, c.Type) + + // Skip when current machine condition is different from the one reported + // in the MachineHealthCheck. + if machineCondition == nil || machineCondition.Status != c.Status { + continue + } + + // If the machine condition has been in the unhealthy state for longer than the + // timeout, mark as unhealthy and collect the message. 
+ timeoutSecondsDuration := time.Duration(ptr.Deref(c.TimeoutSeconds, 0)) * time.Second + + if machineCondition.LastTransitionTime.Add(timeoutSecondsDuration).Before(now) { + unhealthyMachineMessages = append(unhealthyMachineMessages, fmt.Sprintf("Condition %s on Machine is reporting status %s for more than %s", c.Type, c.Status, timeoutSecondsDuration.String())) + logger.V(3).Info("Target is unhealthy: machine condition is in state longer than allowed timeout", "condition", c.Type, "state", c.Status, "timeout", timeoutSecondsDuration.String()) + continue + } + + durationUnhealthy := now.Sub(machineCondition.LastTransitionTime.Time) + nextCheck := timeoutSecondsDuration - durationUnhealthy + time.Second + if nextCheck > 0 { + nextCheckTimes = append(nextCheckTimes, nextCheck) + } + } + + // Machine has Status.NodeRef set, although we couldn't find the node in the workload cluster. + if t.nodeMissing { + logger.V(3).Info("Target is unhealthy: node is missing") + + // Always merge node missing message with any unhealthy machine conditions + nodeMissingMessage := fmt.Sprintf("Node %s has been deleted", t.Machine.Status.NodeRef.Name) + allMessages := append([]string{nodeMissingMessage}, unhealthyMachineMessages...) + + reason := clusterv1.MachineHealthCheckNodeDeletedReason + v1beta1Reason := clusterv1.NodeNotFoundV1Beta1Reason + if len(unhealthyMachineMessages) > 0 { + reason = clusterv1.MachineHealthCheckUnhealthyMachineReason + v1beta1Reason = clusterv1.UnhealthyMachineConditionV1Beta1Reason + } + + // For v1beta2 we use a single-line message prefixed with "Health check failed: " + conditionMessage := fmt.Sprintf("Health check failed: %s", strings.Join(allMessages, "; ")) + conditions.Set(t.Machine, metav1.Condition{ + Type: clusterv1.MachineHealthCheckSucceededCondition, + Status: metav1.ConditionFalse, + Reason: reason, + Message: conditionMessage, + }) + + // For v1beta1 keep the existing format + if len(unhealthyMachineMessages) > 0 { + v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, v1beta1Reason, clusterv1.ConditionSeverityWarning, "%s", strings.Join(allMessages, "; ")) + } else { + v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, v1beta1Reason, clusterv1.ConditionSeverityWarning, "") + } + return true, time.Duration(0) + } + // the node has not been set yet if t.Node == nil { + // Check if we already have unhealthy machine conditions that should trigger remediation + if len(unhealthyMachineMessages) > 0 { + reason := clusterv1.MachineHealthCheckUnhealthyMachineReason + v1beta1Reason := clusterv1.UnhealthyMachineConditionV1Beta1Reason + + // For v1beta2 we use a single-line message prefixed with "Health check failed: " + conditionMessage := fmt.Sprintf("Health check failed: %s", strings.Join(unhealthyMachineMessages, "; ")) + conditions.Set(t.Machine, metav1.Condition{ + Type: clusterv1.MachineHealthCheckSucceededCondition, + Status: metav1.ConditionFalse, + Reason: reason, + Message: conditionMessage, + }) + + // For v1beta1 keep the existing semicolon-separated format (no prefix). + v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, v1beta1Reason, clusterv1.ConditionSeverityWarning, "%s", strings.Join(unhealthyMachineMessages, "; ")) + + return true, time.Duration(0) + } + if timeoutForMachineToHaveNode == disabledNodeStartupTimeout { // Startup timeout is disabled so no need to go any further. 
// No node yet to check conditions, can return early here. - return false, 0 + return false, minDuration(nextCheckTimes) } controlPlaneInitialized := conditions.GetLastTransitionTime(t.Cluster, clusterv1.ClusterControlPlaneInitializedCondition) @@ -164,32 +241,47 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi timeoutDuration := timeoutForMachineToHaveNode.Duration if comparisonTime.Add(timeoutForMachineToHaveNode.Duration).Before(now) { - v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, clusterv1.NodeStartupTimeoutV1Beta1Reason, clusterv1.ConditionSeverityWarning, "Node failed to report startup in %s", timeoutDuration) + // Node startup timeout - merge with any unhealthy machine conditions + nodeTimeoutMessage := fmt.Sprintf("Node failed to report startup in %s", timeoutDuration) + allMessages := append([]string{nodeTimeoutMessage}, unhealthyMachineMessages...) + + reason := clusterv1.MachineHealthCheckNodeStartupTimeoutReason + v1beta1Reason := clusterv1.NodeStartupTimeoutV1Beta1Reason + if len(unhealthyMachineMessages) > 0 { + reason = clusterv1.MachineHealthCheckUnhealthyMachineReason + v1beta1Reason = clusterv1.UnhealthyMachineConditionV1Beta1Reason + } + logger.V(3).Info("Target is unhealthy: machine has no node", "duration", timeoutDuration) + // For v1beta2 we use a single-line message prefixed with "Health check failed: " + conditionMessage := fmt.Sprintf("Health check failed: %s", strings.Join(allMessages, "; ")) conditions.Set(t.Machine, metav1.Condition{ Type: clusterv1.MachineHealthCheckSucceededCondition, Status: metav1.ConditionFalse, - Reason: clusterv1.MachineHealthCheckNodeStartupTimeoutReason, - Message: fmt.Sprintf("Health check failed: Node failed to report startup in %s", timeoutDuration), + Reason: reason, + Message: conditionMessage, }) + + // For v1beta1 keep the existing format + if len(unhealthyMachineMessages) > 0 { + v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, v1beta1Reason, clusterv1.ConditionSeverityWarning, "%s", strings.Join(allMessages, "; ")) + } else { + v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, v1beta1Reason, clusterv1.ConditionSeverityWarning, "Node failed to report startup in %s", timeoutDuration) + } return true, time.Duration(0) } durationUnhealthy := now.Sub(comparisonTime) nextCheck := timeoutDuration - durationUnhealthy + time.Second + if nextCheck > 0 { + nextCheckTimes = append(nextCheckTimes, nextCheck) + } - return false, nextCheck + return false, minDuration(nextCheckTimes) } - // Collect all unhealthy conditions (both node and machine) to provide comprehensive status - var ( - unhealthyMessages []string - unhealthyReasons []string - foundUnhealthyCondition bool - ) - - // check node conditions + // check node conditions (only when node is available) for _, c := range t.MHC.Spec.Checks.UnhealthyNodeConditions { nodeCondition := getNodeCondition(t.Node, c.Type) @@ -204,9 +296,7 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi timeoutSecondsDuration := time.Duration(ptr.Deref(c.TimeoutSeconds, 0)) * time.Second if nodeCondition.LastTransitionTime.Add(timeoutSecondsDuration).Before(now) { - foundUnhealthyCondition = true - unhealthyMessages = append(unhealthyMessages, fmt.Sprintf("Node condition %s is %s for more than %s", c.Type, c.Status, timeoutSecondsDuration.String())) - unhealthyReasons = append(unhealthyReasons, 
"UnhealthyNode") + unhealthyNodeMessages = append(unhealthyNodeMessages, fmt.Sprintf("Condition %s on Node is reporting status %s for more than %s", c.Type, c.Status, timeoutSecondsDuration.String())) logger.V(3).Info("Target is unhealthy: node condition is in state longer than allowed timeout", "condition", c.Type, "state", c.Status, "timeout", timeoutSecondsDuration.String()) continue } @@ -218,78 +308,29 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi } } - // check machine conditions - for _, c := range t.MHC.Spec.Checks.UnhealthyMachineConditions { - machineCondition := getMachineCondition(t.Machine, c.Type) - - // Skip when current machine condition is different from the one reported - // in the MachineHealthCheck. - if machineCondition == nil || machineCondition.Status != c.Status { - continue - } - - // If the machine condition has been in the unhealthy state for longer than the - // timeout, mark as unhealthy and collect the message. - timeoutSecondsDuration := time.Duration(ptr.Deref(c.TimeoutSeconds, 0)) * time.Second - - if machineCondition.LastTransitionTime.Add(timeoutSecondsDuration).Before(now) { - foundUnhealthyCondition = true - unhealthyMessages = append(unhealthyMessages, fmt.Sprintf("Machine condition %s is %s for more than %s", c.Type, c.Status, timeoutSecondsDuration.String())) - unhealthyReasons = append(unhealthyReasons, "UnhealthyMachine") - logger.V(3).Info("Target is unhealthy: machine condition is in state longer than allowed timeout", "condition", c.Type, "state", c.Status, "timeout", timeoutSecondsDuration.String()) - continue - } - - durationUnhealthy := now.Sub(machineCondition.LastTransitionTime.Time) - nextCheck := timeoutSecondsDuration - durationUnhealthy + time.Second - if nextCheck > 0 { - nextCheckTimes = append(nextCheckTimes, nextCheck) - } - } - - // If any unhealthy conditions were found, set the combined status - if foundUnhealthyCondition { - // Determine the primary reason based on a consistent priority order: - // 1. If both node and machine conditions are present, use a combined reason - // 2. 
Otherwise use the specific reason for the type that failed - var primaryReason, v1beta1Reason string - if len(unhealthyReasons) > 0 { - // Check if we have both node and machine reasons - hasNodeReason := false - hasMachineReason := false - for _, reason := range unhealthyReasons { - switch reason { - case "UnhealthyNode": - hasNodeReason = true - case "UnhealthyMachine": - hasMachineReason = true - } - } - - if hasNodeReason && hasMachineReason { - // Both types of conditions are unhealthy - use machine reason but indicate it's combined - primaryReason = clusterv1.MachineHealthCheckUnhealthyMachineReason - v1beta1Reason = clusterv1.UnhealthyMachineConditionV1Beta1Reason - } else if hasMachineReason { - primaryReason = clusterv1.MachineHealthCheckUnhealthyMachineReason - v1beta1Reason = clusterv1.UnhealthyMachineConditionV1Beta1Reason - } else if hasNodeReason { - primaryReason = clusterv1.MachineHealthCheckUnhealthyNodeReason - v1beta1Reason = clusterv1.UnhealthyNodeConditionV1Beta1Reason - } + if len(unhealthyNodeMessages) > 0 || len(unhealthyMachineMessages) > 0 { + reason := clusterv1.MachineHealthCheckUnhealthyNodeReason + v1beta1Reason := clusterv1.UnhealthyNodeConditionV1Beta1Reason + if len(unhealthyMachineMessages) > 0 { + reason = clusterv1.MachineHealthCheckUnhealthyMachineReason + v1beta1Reason = clusterv1.UnhealthyMachineConditionV1Beta1Reason } // Combine all messages into a single comprehensive message - combinedMessage := fmt.Sprintf("Health check failed: %s", strings.Join(unhealthyMessages, "; ")) - - v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, v1beta1Reason, clusterv1.ConditionSeverityWarning, "%s", combinedMessage) - + allMessages := append(unhealthyNodeMessages, unhealthyMachineMessages...) + // For v1beta2 we use a single-line message prefixed with "Health check failed: " + // for compatibility with existing tests and other condition messages. + conditionMessage := fmt.Sprintf("Health check failed: %s", strings.Join(allMessages, "; ")) conditions.Set(t.Machine, metav1.Condition{ Type: clusterv1.MachineHealthCheckSucceededCondition, Status: metav1.ConditionFalse, - Reason: primaryReason, - Message: combinedMessage, + Reason: reason, + Message: conditionMessage, }) + + // For v1beta1 keep the existing semicolon-separated format (no prefix). + v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, v1beta1Reason, clusterv1.ConditionSeverityWarning, "%s", strings.Join(allMessages, "; ")) + return true, time.Duration(0) } From 2c1ca0374c287e0016a2323a587c42045878c6f0 Mon Sep 17 00:00:00 2001 From: Furkat Gofurov Date: Fri, 17 Oct 2025 12:15:12 +0300 Subject: [PATCH 7/8] Fix event message to reflect both machine and node condition checking Signed-off-by: Furkat Gofurov --- .../machinehealthcheck/machinehealthcheck_targets.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go b/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go index 61a252c4dd9b..a5c4f2de9c7a 100644 --- a/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go +++ b/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go @@ -45,8 +45,8 @@ const ( // EventMachineMarkedUnhealthy is emitted when machine was successfully marked as unhealthy. 
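Just above, the series closes out the combined node-and-machine reporting: when both kinds of findings exist, the machine-specific reason wins and all messages are joined into one line. A hedged sketch of that precedence rule in isolation, with short reason strings standing in for the clusterv1 constants:

```go
package main

import (
	"fmt"
	"strings"
)

// combineFindings mirrors the selection logic from the previous patch: any
// unhealthy machine condition promotes the reason to the machine-specific
// one, and node plus machine messages are joined with "; ".
func combineFindings(nodeMsgs, machineMsgs []string) (reason, message string) {
	reason = "UnhealthyNode"
	if len(machineMsgs) > 0 {
		reason = "UnhealthyMachine"
	}
	all := append(append([]string{}, nodeMsgs...), machineMsgs...)
	return reason, "Health check failed: " + strings.Join(all, "; ")
}

func main() {
	reason, msg := combineFindings(
		[]string{"Condition Ready on Node is reporting status Unknown for more than 5m0s"},
		[]string{"Condition EtcdPodHealthy on Machine is reporting status False for more than 5m0s"},
	)
	fmt.Println(reason) // UnhealthyMachine
	fmt.Println(msg)
}
```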
 	EventMachineMarkedUnhealthy string = "MachineMarkedUnhealthy"

-	// EventDetectedUnhealthy is emitted in case a node associated with a
-	// machine was detected unhealthy.
+	// EventDetectedUnhealthy is emitted in case a machine or its associated
+	// node was detected unhealthy.
 	EventDetectedUnhealthy string = "DetectedUnhealthy"
 )

@@ -448,7 +448,7 @@ func (r *Reconciler) healthCheckTargets(targets []healthCheckTarget, logger logr
 			t.Machine,
 			corev1.EventTypeNormal,
 			EventDetectedUnhealthy,
-			"Machine %s has unhealthy Node %s",
+			"Machine %s (Node %s) is failing machine health check rules and it is likely to go unhealthy",
 			klog.KObj(t.Machine),
 			t.nodeName(),
 		)

From c7f3c12df7d3a217e28b177c4d539ff159f46eab Mon Sep 17 00:00:00 2001
From: Furkat Gofurov
Date: Sat, 18 Oct 2025 01:14:33 +0300
Subject: [PATCH 8/8] Simplify `needsRemediation` further by using two
 sub-functions: one for machineChecks and the other for nodeChecks. Another
 benefit of this code structure is that condition management is implemented in
 only one place.

Co-authored-by: Fabrizio Pandini
Signed-off-by: Furkat Gofurov
---
 .../machinehealthcheck_targets.go | 212 +++++++-----------
 .../machinehealthcheck_targets_test.go | 8 +-
 2 files changed, 84 insertions(+), 136 deletions(-)

diff --git a/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go b/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go
index a5c4f2de9c7a..3b199faea769 100644
--- a/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go
+++ b/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go
@@ -66,14 +66,6 @@ type healthCheckTarget struct {
 	nodeMissing bool
 }

-// Get the node name if the target has a node.
-func (t *healthCheckTarget) nodeName() string {
-	if t.Node != nil {
-		return t.Node.GetName()
-	}
-	return ""
-}
-
 // needsRemediation determines whether a given target needs remediation.
 // The machine will need remediation if any of the following are true:
 // - The Machine has the remediate machine annotation
 // - The Machine has failed for some reason
 // - The Machine did not get a node before `timeoutForMachineToHaveNode` elapses
 // - The Node has gone away
 // - Any condition on the node is matched for the given timeout
 // Returns true if remediation is needed, and a duration indicating when to recheck if remediation
 // is not immediately required. The target should be requeued after this duration.
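Several hunks return `minDuration(nextCheckTimes)`; the helper itself sits outside the diff context shown here. A plausible implementation, stated as an assumption about its behavior rather than as the file's actual code:

```go
package main

import (
	"fmt"
	"time"
)

// minDuration is assumed to return the shortest duration in the slice, and
// zero for an empty slice, which matches how the call sites above use it.
func minDuration(durations []time.Duration) time.Duration {
	if len(durations) == 0 {
		return time.Duration(0)
	}
	minimum := durations[0]
	for _, d := range durations[1:] {
		if d < minimum {
			minimum = d
		}
	}
	return minimum
}

func main() {
	fmt.Println(minDuration([]time.Duration{9 * time.Second, 5 * time.Second})) // 5s
	fmt.Println(minDuration(nil))                                               // 0s
}
```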
func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachineToHaveNode metav1.Duration) (bool, time.Duration) { - var nextCheckTimes []time.Duration - now := time.Now() - if annotations.HasRemediateMachine(t.Machine) { v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, clusterv1.HasRemediateMachineAnnotationV1Beta1Reason, clusterv1.ConditionSeverityWarning, "Marked for remediation via remediate-machine annotation") logger.V(3).Info("Target is marked for remediation via remediate-machine annotation") @@ -120,12 +109,67 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi return false, 0 } - // Collect all unhealthy conditions (both node and machine) to provide comprehensive status - // Always evaluate machine conditions regardless of node state for consistent behavior - var ( - unhealthyNodeMessages []string - unhealthyMachineMessages []string - ) + // Check machine conditions + unhealthyMachineMessages, nextMachineCheck := t.machineChecks(logger) + + // Check node conditions + nodeConditionReason, nodeV1beta1ConditionReason, unhealthyNodeMessages, nextNodeCheck := t.nodeChecks(logger, timeoutForMachineToHaveNode) + + // Combine results + if len(unhealthyMachineMessages) == 0 && len(unhealthyNodeMessages) == 0 { + var nextCheckTimes []time.Duration + if nextMachineCheck > 0 { + nextCheckTimes = append(nextCheckTimes, nextMachineCheck) + } + if nextNodeCheck > 0 { + nextCheckTimes = append(nextCheckTimes, nextNodeCheck) + } + result := minDuration(nextCheckTimes) + return false, result + } + + reason := nodeConditionReason + v1beta1Reason := nodeV1beta1ConditionReason + if len(unhealthyMachineMessages) > 0 { + reason = clusterv1.MachineHealthCheckUnhealthyMachineReason + v1beta1Reason = clusterv1.UnhealthyMachineConditionV1Beta1Reason + } + + // Combine all messages into a single comprehensive message + allMessages := append(unhealthyMachineMessages, unhealthyNodeMessages...) + + conditionMessage := "Health check failed:\n" + for i, m := range allMessages { + conditionMessage += fmt.Sprintf(" * %s", m) + if i != len(allMessages)-1 { + conditionMessage += "\n" + } + } + + conditions.Set(t.Machine, metav1.Condition{ + Type: clusterv1.MachineHealthCheckSucceededCondition, + Status: metav1.ConditionFalse, + Reason: reason, + Message: conditionMessage, + }) + + // v1beta1 condition message formatting: + // - For NodeNotFound (node deleted), the legacy condition historically has an empty message. + // - For all other reasons (e.g., NodeStartupTimeout, UnhealthyNodeCondition, UnhealthyMachineCondition), + // include a concise, semicolon-separated list of messages without the "Health check failed" prefix. 
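The v1beta2 condition message built above switches from a single line to a bullet list. A self-contained sketch of the same formatting loop, so the resulting shape is visible end to end (the function name is illustrative; the loop body matches the hunk):

```go
package main

import "fmt"

// buildConditionMessage reproduces the formatting from the hunk above: a
// "Health check failed:" header, one " * " bullet per finding, and no
// trailing newline after the last entry.
func buildConditionMessage(allMessages []string) string {
	conditionMessage := "Health check failed:\n"
	for i, m := range allMessages {
		conditionMessage += fmt.Sprintf(" * %s", m)
		if i != len(allMessages)-1 {
			conditionMessage += "\n"
		}
	}
	return conditionMessage
}

func main() {
	fmt.Println(buildConditionMessage([]string{
		"Condition EtcdPodHealthy on Machine is reporting status False for more than 5m0s",
		"Condition Ready on Node is reporting status Unknown for more than 5m0s",
	}))
}
```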
+ v1beta1Message := "" + if v1beta1Reason != clusterv1.NodeNotFoundV1Beta1Reason { + v1beta1Message = strings.Join(allMessages, "; ") + } + v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, v1beta1Reason, clusterv1.ConditionSeverityWarning, "%s", v1beta1Message) + + return true, time.Duration(0) +} + +func (t *healthCheckTarget) machineChecks(logger logr.Logger) ([]string, time.Duration) { + var unhealthyMachineMessages []string + var nextCheckTimes []time.Duration + now := time.Now() // Always check machine conditions first, regardless of node state for _, c := range t.MHC.Spec.Checks.UnhealthyMachineConditions { @@ -154,65 +198,29 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi } } + if len(unhealthyMachineMessages) > 0 { + return unhealthyMachineMessages, time.Duration(0) + } + return nil, minDuration(nextCheckTimes) +} + +func (t *healthCheckTarget) nodeChecks(logger logr.Logger, timeoutForMachineToHaveNode metav1.Duration) (string, string, []string, time.Duration) { + var nextCheckTimes []time.Duration + now := time.Now() + // Machine has Status.NodeRef set, although we couldn't find the node in the workload cluster. if t.nodeMissing { logger.V(3).Info("Target is unhealthy: node is missing") - - // Always merge node missing message with any unhealthy machine conditions nodeMissingMessage := fmt.Sprintf("Node %s has been deleted", t.Machine.Status.NodeRef.Name) - allMessages := append([]string{nodeMissingMessage}, unhealthyMachineMessages...) - - reason := clusterv1.MachineHealthCheckNodeDeletedReason - v1beta1Reason := clusterv1.NodeNotFoundV1Beta1Reason - if len(unhealthyMachineMessages) > 0 { - reason = clusterv1.MachineHealthCheckUnhealthyMachineReason - v1beta1Reason = clusterv1.UnhealthyMachineConditionV1Beta1Reason - } - - // For v1beta2 we use a single-line message prefixed with "Health check failed: " - conditionMessage := fmt.Sprintf("Health check failed: %s", strings.Join(allMessages, "; ")) - conditions.Set(t.Machine, metav1.Condition{ - Type: clusterv1.MachineHealthCheckSucceededCondition, - Status: metav1.ConditionFalse, - Reason: reason, - Message: conditionMessage, - }) - - // For v1beta1 keep the existing format - if len(unhealthyMachineMessages) > 0 { - v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, v1beta1Reason, clusterv1.ConditionSeverityWarning, "%s", strings.Join(allMessages, "; ")) - } else { - v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, v1beta1Reason, clusterv1.ConditionSeverityWarning, "") - } - return true, time.Duration(0) + return clusterv1.MachineHealthCheckNodeDeletedReason, clusterv1.NodeNotFoundV1Beta1Reason, []string{nodeMissingMessage}, time.Duration(0) } // the node has not been set yet if t.Node == nil { - // Check if we already have unhealthy machine conditions that should trigger remediation - if len(unhealthyMachineMessages) > 0 { - reason := clusterv1.MachineHealthCheckUnhealthyMachineReason - v1beta1Reason := clusterv1.UnhealthyMachineConditionV1Beta1Reason - - // For v1beta2 we use a single-line message prefixed with "Health check failed: " - conditionMessage := fmt.Sprintf("Health check failed: %s", strings.Join(unhealthyMachineMessages, "; ")) - conditions.Set(t.Machine, metav1.Condition{ - Type: clusterv1.MachineHealthCheckSucceededCondition, - Status: metav1.ConditionFalse, - Reason: reason, - Message: conditionMessage, - }) - - // For v1beta1 keep the existing 
semicolon-separated format (no prefix). - v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, v1beta1Reason, clusterv1.ConditionSeverityWarning, "%s", strings.Join(unhealthyMachineMessages, "; ")) - - return true, time.Duration(0) - } - if timeoutForMachineToHaveNode == disabledNodeStartupTimeout { // Startup timeout is disabled so no need to go any further. // No node yet to check conditions, can return early here. - return false, minDuration(nextCheckTimes) + return "", "", nil, time.Duration(0) } controlPlaneInitialized := conditions.GetLastTransitionTime(t.Cluster, clusterv1.ClusterControlPlaneInitializedCondition) @@ -241,47 +249,18 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi timeoutDuration := timeoutForMachineToHaveNode.Duration if comparisonTime.Add(timeoutForMachineToHaveNode.Duration).Before(now) { - // Node startup timeout - merge with any unhealthy machine conditions - nodeTimeoutMessage := fmt.Sprintf("Node failed to report startup in %s", timeoutDuration) - allMessages := append([]string{nodeTimeoutMessage}, unhealthyMachineMessages...) - - reason := clusterv1.MachineHealthCheckNodeStartupTimeoutReason - v1beta1Reason := clusterv1.NodeStartupTimeoutV1Beta1Reason - if len(unhealthyMachineMessages) > 0 { - reason = clusterv1.MachineHealthCheckUnhealthyMachineReason - v1beta1Reason = clusterv1.UnhealthyMachineConditionV1Beta1Reason - } - logger.V(3).Info("Target is unhealthy: machine has no node", "duration", timeoutDuration) - - // For v1beta2 we use a single-line message prefixed with "Health check failed: " - conditionMessage := fmt.Sprintf("Health check failed: %s", strings.Join(allMessages, "; ")) - conditions.Set(t.Machine, metav1.Condition{ - Type: clusterv1.MachineHealthCheckSucceededCondition, - Status: metav1.ConditionFalse, - Reason: reason, - Message: conditionMessage, - }) - - // For v1beta1 keep the existing format - if len(unhealthyMachineMessages) > 0 { - v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, v1beta1Reason, clusterv1.ConditionSeverityWarning, "%s", strings.Join(allMessages, "; ")) - } else { - v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, v1beta1Reason, clusterv1.ConditionSeverityWarning, "Node failed to report startup in %s", timeoutDuration) - } - return true, time.Duration(0) + nodeTimeoutMessage := fmt.Sprintf("Node failed to report startup in %s", timeoutDuration) + return clusterv1.MachineHealthCheckNodeStartupTimeoutReason, clusterv1.NodeStartupTimeoutV1Beta1Reason, []string{nodeTimeoutMessage}, time.Duration(0) } durationUnhealthy := now.Sub(comparisonTime) nextCheck := timeoutDuration - durationUnhealthy + time.Second - if nextCheck > 0 { - nextCheckTimes = append(nextCheckTimes, nextCheck) - } - - return false, minDuration(nextCheckTimes) + return "", "", nil, nextCheck } // check node conditions (only when node is available) + var unhealthyNodeMessages []string for _, c := range t.MHC.Spec.Checks.UnhealthyNodeConditions { nodeCondition := getNodeCondition(t.Node, c.Type) @@ -308,33 +287,10 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi } } - if len(unhealthyNodeMessages) > 0 || len(unhealthyMachineMessages) > 0 { - reason := clusterv1.MachineHealthCheckUnhealthyNodeReason - v1beta1Reason := clusterv1.UnhealthyNodeConditionV1Beta1Reason - if len(unhealthyMachineMessages) > 0 { - reason = 
clusterv1.MachineHealthCheckUnhealthyMachineReason - v1beta1Reason = clusterv1.UnhealthyMachineConditionV1Beta1Reason - } - - // Combine all messages into a single comprehensive message - allMessages := append(unhealthyNodeMessages, unhealthyMachineMessages...) - // For v1beta2 we use a single-line message prefixed with "Health check failed: " - // for compatibility with existing tests and other condition messages. - conditionMessage := fmt.Sprintf("Health check failed: %s", strings.Join(allMessages, "; ")) - conditions.Set(t.Machine, metav1.Condition{ - Type: clusterv1.MachineHealthCheckSucceededCondition, - Status: metav1.ConditionFalse, - Reason: reason, - Message: conditionMessage, - }) - - // For v1beta1 keep the existing semicolon-separated format (no prefix). - v1beta1conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededV1Beta1Condition, v1beta1Reason, clusterv1.ConditionSeverityWarning, "%s", strings.Join(allMessages, "; ")) - - return true, time.Duration(0) + if len(unhealthyNodeMessages) > 0 { + return clusterv1.MachineHealthCheckUnhealthyNodeReason, clusterv1.UnhealthyNodeConditionV1Beta1Reason, unhealthyNodeMessages, time.Duration(0) } - - return false, minDuration(nextCheckTimes) + return "", "", nil, minDuration(nextCheckTimes) } // getTargetsFromMHC uses the MachineHealthCheck's selector to fetch machines @@ -444,14 +400,6 @@ func (r *Reconciler) healthCheckTargets(targets []healthCheckTarget, logger logr if nextCheck > 0 { logger.V(3).Info("Target is likely to go unhealthy", "timeUntilUnhealthy", nextCheck.Truncate(time.Second).String()) - r.recorder.Eventf( - t.Machine, - corev1.EventTypeNormal, - EventDetectedUnhealthy, - "Machine %s (Node %s) is failing machine health check rules and it is likely to go unhealthy", - klog.KObj(t.Machine), - t.nodeName(), - ) nextCheckTimes = append(nextCheckTimes, nextCheck) continue } diff --git a/internal/controllers/machinehealthcheck/machinehealthcheck_targets_test.go b/internal/controllers/machinehealthcheck/machinehealthcheck_targets_test.go index 16a4029dcaa0..0e98bacd7abd 100644 --- a/internal/controllers/machinehealthcheck/machinehealthcheck_targets_test.go +++ b/internal/controllers/machinehealthcheck/machinehealthcheck_targets_test.go @@ -289,7 +289,7 @@ func TestHealthCheckTargets(t *testing.T) { Node: nil, } nodeNotYetStartedTarget1200sCondition := newFailedHealthCheckV1Beta1Condition(clusterv1.NodeStartupTimeoutV1Beta1Reason, "Node failed to report startup in %s", timeoutForMachineToHaveNode) - nodeNotYetStartedTarget1200sV1Beta2Condition := newFailedHealthCheckCondition(clusterv1.MachineHealthCheckNodeStartupTimeoutReason, "Health check failed: Node failed to report startup in %s", timeoutForMachineToHaveNode) + nodeNotYetStartedTarget1200sV1Beta2Condition := newFailedHealthCheckCondition(clusterv1.MachineHealthCheckNodeStartupTimeoutReason, "Health check failed:\n * Node failed to report startup in %s", timeoutForMachineToHaveNode) testMachineCreated400s := testMachine.DeepCopy() nowMinus400s := metav1.NewTime(time.Now().Add(-400 * time.Second)) @@ -311,7 +311,7 @@ func TestHealthCheckTargets(t *testing.T) { nodeMissing: true, } nodeGoneAwayCondition := newFailedHealthCheckV1Beta1Condition(clusterv1.NodeNotFoundV1Beta1Reason, "") - nodeGoneAwayV1Beta2Condition := newFailedHealthCheckCondition(clusterv1.MachineHealthCheckNodeDeletedReason, "Health check failed: Node %s has been deleted", testMachine.Status.NodeRef.Name) + nodeGoneAwayV1Beta2Condition := 
newFailedHealthCheckCondition(clusterv1.MachineHealthCheckNodeDeletedReason, "Health check failed:\n * Node %s has been deleted", testMachine.Status.NodeRef.Name) // Create a test MHC without conditions testMHCEmptyConditions := &clusterv1.MachineHealthCheck{ @@ -391,7 +391,7 @@ func TestHealthCheckTargets(t *testing.T) { nodeMissing: false, } nodeUnknown400Condition := newFailedHealthCheckV1Beta1Condition(clusterv1.UnhealthyNodeConditionV1Beta1Reason, "Condition Ready on Node is reporting status Unknown for more than %s", (time.Duration(timeoutForUnhealthyNodeConditions) * time.Second).String()) - nodeUnknown400V1Beta2Condition := newFailedHealthCheckCondition(clusterv1.MachineHealthCheckUnhealthyNodeReason, "Health check failed: Condition Ready on Node is reporting status Unknown for more than %s", (time.Duration(timeoutForUnhealthyNodeConditions) * time.Second).String()) + nodeUnknown400V1Beta2Condition := newFailedHealthCheckCondition(clusterv1.MachineHealthCheckUnhealthyNodeReason, "Health check failed:\n * Condition Ready on Node is reporting status Unknown for more than %s", (time.Duration(timeoutForUnhealthyNodeConditions) * time.Second).String()) // Target for when a node is healthy testNodeHealthy := newTestNode("node1") @@ -430,7 +430,7 @@ func TestHealthCheckTargets(t *testing.T) { ) machineUnhealthy400V1Beta2Condition := newFailedHealthCheckCondition( clusterv1.MachineHealthCheckUnhealthyMachineReason, - "Health check failed: Condition EtcdPodHealthy on Machine is reporting status False for more than %s", + "Health check failed:\n * Condition EtcdPodHealthy on Machine is reporting status False for more than %s", (time.Duration(timeoutForUnhealthyMachineConditions) * time.Second).String(), )
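The updated test expectations rely on `newFailedHealthCheckCondition`, which is defined elsewhere in the test file and not shown in this diff. A plausible reconstruction consistent with its call sites, assuming the `clusterv1` alias maps to this repo's api/core/v1beta2 package as it does elsewhere in the series:

```go
package machinehealthcheck

import (
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2"
)

// newFailedHealthCheckCondition is an assumed shape, not the test file's
// actual code: it builds the expected MachineHealthCheckSucceeded=False
// condition for a given reason and printf-style message.
func newFailedHealthCheckCondition(reason string, messageFormat string, messageArgs ...interface{}) metav1.Condition {
	return metav1.Condition{
		Type:    clusterv1.MachineHealthCheckSucceededCondition,
		Status:  metav1.ConditionFalse,
		Reason:  reason,
		Message: fmt.Sprintf(messageFormat, messageArgs...),
	}
}
```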