Skip to content

Commit af68d6e

Browse files
Add support for checking Machine conditions in MachineHealthCheck
MachineHealthCheck currently only allows checking Node conditions to determine whether a machine is healthy. However, Machine conditions capture state that does not exist on Nodes, for example control plane conditions such as EtcdPodHealthy and SchedulerPodHealthy, which can indicate whether a control plane machine has been created correctly. Adding support for Machine conditions enables us to perform remediation during control plane upgrades. This PR introduces a new field as part of MachineHealthCheckChecks: `UnhealthyMachineConditions`. It mirrors the behavior of `UnhealthyNodeConditions`, but the MachineHealthCheck controller checks the Machine's conditions instead of the Node's. This reimplements and extends the work originally proposed by @justinmir in PR #12275. Co-authored-by: Justin Miron <[email protected]> Signed-off-by: Furkat Gofurov <[email protected]>
1 parent 7a28275 commit af68d6e

38 files changed

+1552
-40
lines changed

api/core/v1beta1/conversion_test.go

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ func ClusterFuzzFuncs(_ runtimeserializer.CodecFactory) []interface{} {
9999
hubClusterVariable,
100100
hubFailureDomain,
101101
hubUnhealthyNodeCondition,
102+
hubUnhealthyMachineCondition,
102103
spokeCluster,
103104
spokeClusterTopology,
104105
spokeObjectReference,
@@ -126,6 +127,17 @@ func hubClusterSpec(in *clusterv1.ClusterSpec, c randfill.Continue) {
126127
in.ControlPlaneRef.APIGroup = gvk.Group
127128
in.ControlPlaneRef.Kind = gvk.Kind
128129
}
130+
131+
// remove MachineHealthCheck.UnhealthyMachineConditions as it does not exist in v1beta1.
132+
if in.Topology.IsDefined() && in.Topology.ControlPlane.HealthCheck.IsDefined() {
133+
in.Topology.ControlPlane.HealthCheck.Checks.UnhealthyMachineConditions = nil
134+
}
135+
136+
if in.Topology.IsDefined() && len(in.Topology.Workers.MachineDeployments) > 0 {
137+
for i := range in.Topology.Workers.MachineDeployments {
138+
in.Topology.Workers.MachineDeployments[i].HealthCheck.Checks.UnhealthyMachineConditions = nil
139+
}
140+
}
129141
}
130142

131143
func hubClusterStatus(in *clusterv1.ClusterStatus, c randfill.Continue) {
@@ -177,6 +189,14 @@ func hubUnhealthyNodeCondition(in *clusterv1.UnhealthyNodeCondition, c randfill.
177189
}
178190
}
179191

192+
func hubUnhealthyMachineCondition(in *clusterv1.UnhealthyMachineCondition, c randfill.Continue) {
193+
c.FillNoCustom(in)
194+
195+
if in.TimeoutSeconds == nil {
196+
in.TimeoutSeconds = ptr.To(int32(0)) // TimeoutSeconds is a required field and nil does not round trip
197+
}
198+
}
199+
180200
func spokeCluster(in *Cluster, c randfill.Continue) {
181201
c.FillNoCustom(in)
182202

@@ -267,12 +287,14 @@ func spokeClusterVariable(in *ClusterVariable, c randfill.Continue) {
267287

268288
func ClusterClassFuncs(_ runtimeserializer.CodecFactory) []interface{} {
269289
return []interface{}{
290+
hubClusterClassSpec,
270291
hubClusterClassVariable,
271292
hubClusterClassStatusVariableDefinition,
272293
hubClusterClassStatus,
273294
hubJSONPatch,
274295
hubJSONSchemaProps,
275296
hubUnhealthyNodeCondition,
297+
hubUnhealthyMachineCondition,
276298
spokeClusterClass,
277299
spokeObjectReference,
278300
spokeClusterClassStatus,
@@ -287,6 +309,21 @@ func ClusterClassFuncs(_ runtimeserializer.CodecFactory) []interface{} {
287309
}
288310
}
289311

312+
func hubClusterClassSpec(in *clusterv1.ClusterClassSpec, c randfill.Continue) {
313+
c.FillNoCustom(in)
314+
315+
// remove MachineHealthCheck.UnhealthyMachineConditions as it does not exist in v1beta1.
316+
if in.ControlPlane.HealthCheck.IsDefined() && in.ControlPlane.HealthCheck.Checks.UnhealthyMachineConditions != nil {
317+
in.ControlPlane.HealthCheck.Checks.UnhealthyMachineConditions = nil
318+
}
319+
320+
if len(in.Workers.MachineDeployments) > 0 {
321+
for i := range in.Workers.MachineDeployments {
322+
in.Workers.MachineDeployments[i].HealthCheck.Checks.UnhealthyMachineConditions = nil
323+
}
324+
}
325+
}
326+
290327
func hubClusterClassVariable(in *clusterv1.ClusterClassVariable, c randfill.Continue) {
291328
c.FillNoCustom(in)
292329

@@ -728,7 +765,9 @@ func spokeMachineDeploymentStatus(in *MachineDeploymentStatus, c randfill.Contin
728765
func MachineHealthCheckFuzzFuncs(_ runtimeserializer.CodecFactory) []interface{} {
729766
return []interface{}{
730767
hubUnhealthyNodeCondition,
768+
hubUnhealthyMachineCondition,
731769
hubMachineHealthCheckStatus,
770+
hubMachineHealthCheckSpec,
732771
spokeMachineHealthCheck,
733772
spokeMachineHealthCheckSpec,
734773
spokeObjectReference,
@@ -737,6 +776,14 @@ func MachineHealthCheckFuzzFuncs(_ runtimeserializer.CodecFactory) []interface{}
737776
}
738777
}
739778

779+
func hubMachineHealthCheckSpec(in *clusterv1.MachineHealthCheckSpec, c randfill.Continue) {
780+
c.FillNoCustom(in)
781+
782+
if in.Checks.UnhealthyMachineConditions != nil {
783+
in.Checks.UnhealthyMachineConditions = nil
784+
}
785+
}
786+
740787
func hubMachineHealthCheckStatus(in *clusterv1.MachineHealthCheckStatus, c randfill.Continue) {
741788
c.FillNoCustom(in)
742789
// Drop empty structs with only omit empty fields.

api/core/v1beta2/cluster_types.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -725,6 +725,16 @@ type ControlPlaneTopologyHealthCheckChecks struct {
725725
// +kubebuilder:validation:MinItems=1
726726
// +kubebuilder:validation:MaxItems=100
727727
UnhealthyNodeConditions []UnhealthyNodeCondition `json:"unhealthyNodeConditions,omitempty"`
728+
729+
// unhealthyMachineConditions contains a list of the machine conditions that determine
730+
// whether a machine is considered unhealthy. The conditions are combined in a
731+
// logical OR, i.e. if any of the conditions is met, the machine is unhealthy.
732+
//
733+
// +optional
734+
// +listType=atomic
735+
// +kubebuilder:validation:MinItems=1
736+
// +kubebuilder:validation:MaxItems=100
737+
UnhealthyMachineConditions []UnhealthyMachineCondition `json:"unhealthyMachineConditions,omitempty"`
728738
}
729739

730740
// ControlPlaneTopologyHealthCheckRemediation configures if and how remediations are triggered if a control plane Machine is unhealthy.
@@ -975,6 +985,16 @@ type MachineDeploymentTopologyHealthCheckChecks struct {
975985
// +kubebuilder:validation:MinItems=1
976986
// +kubebuilder:validation:MaxItems=100
977987
UnhealthyNodeConditions []UnhealthyNodeCondition `json:"unhealthyNodeConditions,omitempty"`
988+
989+
// unhealthyMachineConditions contains a list of the machine conditions that determine
990+
// whether a machine is considered unhealthy. The conditions are combined in a
991+
// logical OR, i.e. if any of the conditions is met, the machine is unhealthy.
992+
//
993+
// +optional
994+
// +listType=atomic
995+
// +kubebuilder:validation:MinItems=1
996+
// +kubebuilder:validation:MaxItems=100
997+
UnhealthyMachineConditions []UnhealthyMachineCondition `json:"unhealthyMachineConditions,omitempty"`
978998
}
979999

9801000
// MachineDeploymentTopologyHealthCheckRemediation configures if and how remediations are triggered if a MachineDeployment Machine is unhealthy.

api/core/v1beta2/clusterclass_types.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,16 @@ type ControlPlaneClassHealthCheckChecks struct {
281281
// +kubebuilder:validation:MinItems=1
282282
// +kubebuilder:validation:MaxItems=100
283283
UnhealthyNodeConditions []UnhealthyNodeCondition `json:"unhealthyNodeConditions,omitempty"`
284+
285+
// unhealthyMachineConditions contains a list of the machine conditions that determine
286+
// whether a machine is considered unhealthy. The conditions are combined in a
287+
// logical OR, i.e. if any of the conditions is met, the machine is unhealthy.
288+
//
289+
// +optional
290+
// +listType=atomic
291+
// +kubebuilder:validation:MinItems=1
292+
// +kubebuilder:validation:MaxItems=100
293+
UnhealthyMachineConditions []UnhealthyMachineCondition `json:"unhealthyMachineConditions,omitempty"`
284294
}
285295

286296
// ControlPlaneClassHealthCheckRemediation configures if and how remediations are triggered if a control plane Machine is unhealthy.
@@ -542,6 +552,16 @@ type MachineDeploymentClassHealthCheckChecks struct {
542552
// +kubebuilder:validation:MinItems=1
543553
// +kubebuilder:validation:MaxItems=100
544554
UnhealthyNodeConditions []UnhealthyNodeCondition `json:"unhealthyNodeConditions,omitempty"`
555+
556+
// unhealthyMachineConditions contains a list of the machine conditions that determine
557+
// whether a machine is considered unhealthy. The conditions are combined in a
558+
// logical OR, i.e. if any of the conditions is met, the machine is unhealthy.
559+
//
560+
// +optional
561+
// +listType=atomic
562+
// +kubebuilder:validation:MinItems=1
563+
// +kubebuilder:validation:MaxItems=100
564+
UnhealthyMachineConditions []UnhealthyMachineCondition `json:"unhealthyMachineConditions,omitempty"`
545565
}
546566

547567
// MachineDeploymentClassHealthCheckRemediation configures if and how remediations are triggered if a MachineDeployment Machine is unhealthy.

api/core/v1beta2/machine_types.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,10 @@ const (
276276
// defined by a MachineHealthCheck object.
277277
MachineHealthCheckUnhealthyNodeReason = "UnhealthyNode"
278278

279+
// MachineHealthCheckUnhealthyMachineReason surfaces when the machine does not pass the health checks
280+
// defined by a MachineHealthCheck object.
281+
MachineHealthCheckUnhealthyMachineReason = "UnhealthyMachine"
282+
279283
// MachineHealthCheckNodeStartupTimeoutReason surfaces when the node hosted on the machine does not appear within
280284
// the timeout defined by a MachineHealthCheck object.
281285
MachineHealthCheckNodeStartupTimeoutReason = "NodeStartupTimeout"

api/core/v1beta2/machinehealthcheck_types.go

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,16 @@ type MachineHealthCheckChecks struct {
111111
// +kubebuilder:validation:MinItems=1
112112
// +kubebuilder:validation:MaxItems=100
113113
UnhealthyNodeConditions []UnhealthyNodeCondition `json:"unhealthyNodeConditions,omitempty"`
114+
115+
// unhealthyMachineConditions contains a list of the machine conditions that determine
116+
// whether a machine is considered unhealthy. The conditions are combined in a
117+
// logical OR, i.e. if any of the conditions is met, the machine is unhealthy.
118+
//
119+
// +optional
120+
// +listType=atomic
121+
// +kubebuilder:validation:MinItems=1
122+
// +kubebuilder:validation:MaxItems=100
123+
UnhealthyMachineConditions []UnhealthyMachineCondition `json:"unhealthyMachineConditions,omitempty"`
114124
}
115125

116126
// MachineHealthCheckRemediation configures if and how remediations are triggered if a Machine is unhealthy.
@@ -234,6 +244,31 @@ type UnhealthyNodeCondition struct {
234244
TimeoutSeconds *int32 `json:"timeoutSeconds,omitempty"`
235245
}
236246

247+
// UnhealthyMachineCondition represents a Machine condition type and value with a timeout
248+
// specified in seconds. When the named condition has been in the given
249+
// status for at least the timeout value, a machine is considered unhealthy.
250+
type UnhealthyMachineCondition struct {
251+
// type of Machine condition
252+
// +kubebuilder:validation:Type=string
253+
// +kubebuilder:validation:MinLength=1
254+
// +kubebuilder:validation:MaxLength=316
255+
// +required
256+
Type string `json:"type"`
257+
258+
// status of the condition, one of True, False, Unknown.
259+
// +required
260+
// +kubebuilder:validation:Enum=True;False;Unknown
261+
Status metav1.ConditionStatus `json:"status"`
262+
263+
// timeoutSeconds is the duration that a machine must be in a given status for,
264+
// after which the machine is considered unhealthy.
265+
// For example, with a value of 3600, the machine must match the status
266+
// for at least 1 hour before being considered unhealthy.
267+
// +required
268+
// +kubebuilder:validation:Minimum=0
269+
TimeoutSeconds *int32 `json:"timeoutSeconds,omitempty"`
270+
}
271+
237272
// MachineHealthCheckStatus defines the observed state of MachineHealthCheck.
238273
// +kubebuilder:validation:MinProperties=1
239274
type MachineHealthCheckStatus struct {

api/core/v1beta2/v1beta1_condition_consts.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,9 @@ const (
157157

158158
// UnhealthyNodeConditionV1Beta1Reason is the reason used when a machine's node has one of the MachineHealthCheck's unhealthy conditions.
159159
UnhealthyNodeConditionV1Beta1Reason = "UnhealthyNode"
160+
161+
// UnhealthyMachineConditionV1Beta1Reason is the reason used when a machine has one of the MachineHealthCheck's unhealthy conditions.
162+
UnhealthyMachineConditionV1Beta1Reason = "UnhealthyMachine"
160163
)
161164

162165
const (

api/core/v1beta2/zz_generated.deepcopy.go

Lines changed: 55 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)