diff --git a/controlplane/kubeadm/internal/control_plane.go b/controlplane/kubeadm/internal/control_plane.go
index efe16d9094b6..6c16276f63df 100644
--- a/controlplane/kubeadm/internal/control_plane.go
+++ b/controlplane/kubeadm/internal/control_plane.go
@@ -51,12 +51,12 @@ type ControlPlane struct {
 	machinesPatchHelpers map[string]*patch.Helper
 
 	// MachinesNotUpToDate is the source of truth for Machines that are not up-to-date.
-	// It should be used to check if a Machine is up-to-date (not machinesNotUpToDateResults).
+	// It should be used to check if a Machine is up-to-date (not machinesUpToDateResults).
 	MachinesNotUpToDate collections.Machines
-	// machinesNotUpToDateResults is used to store the result of the UpToDate call for all Machines
+	// machinesUpToDateResults is used to store the result of the UpToDate call for all Machines
 	// (even for Machines that are up-to-date).
 	// MachinesNotUpToDate should always be used instead to check if a Machine is up-to-date.
-	machinesNotUpToDateResults map[string]NotUpToDateResult
+	machinesUpToDateResults map[string]UpToDateResult
 
 	// reconciliationTime is the time of the current reconciliation, and should be used for all "now" calculations
 	reconciliationTime metav1.Time
@@ -122,9 +122,9 @@ func NewControlPlane(ctx context.Context, managementCluster ManagementCluster, c
 	// Select machines that should be rolled out because of an outdated configuration or because rolloutAfter/Before expired.
 	reconciliationTime := metav1.Now()
 	machinesNotUptoDate := make(collections.Machines, len(ownedMachines))
-	machinesNotUpToDateResults := map[string]NotUpToDateResult{}
+	machinesUpToDateResults := map[string]UpToDateResult{}
 	for _, m := range ownedMachines {
-		upToDate, notUpToDateResult, err := UpToDate(ctx, client, cluster, m, kcp, &reconciliationTime, infraMachines, kubeadmConfigs)
+		upToDate, upToDateResult, err := UpToDate(ctx, client, cluster, m, kcp, &reconciliationTime, infraMachines, kubeadmConfigs)
 		if err != nil {
 			return nil, err
 		}
@@ -133,20 +133,20 @@ func NewControlPlane(ctx context.Context, managementCluster ManagementCluster, c
 		}
 		// Set this even if machine is UpToDate. This is needed to complete triggering in-place updates
 		// MachinesNotUpToDate should always be used instead to check if a Machine is up-to-date.
-		machinesNotUpToDateResults[m.Name] = *notUpToDateResult
+		machinesUpToDateResults[m.Name] = *upToDateResult
 	}
 
 	return &ControlPlane{
-		KCP:                        kcp,
-		Cluster:                    cluster,
-		Machines:                   ownedMachines,
-		machinesPatchHelpers:       patchHelpers,
-		MachinesNotUpToDate:        machinesNotUptoDate,
-		machinesNotUpToDateResults: machinesNotUpToDateResults,
-		KubeadmConfigs:             kubeadmConfigs,
-		InfraResources:             infraMachines,
-		reconciliationTime:         reconciliationTime,
-		managementCluster:          managementCluster,
+		KCP:                     kcp,
+		Cluster:                 cluster,
+		Machines:                ownedMachines,
+		machinesPatchHelpers:    patchHelpers,
+		MachinesNotUpToDate:     machinesNotUptoDate,
+		machinesUpToDateResults: machinesUpToDateResults,
+		KubeadmConfigs:          kubeadmConfigs,
+		InfraResources:          infraMachines,
+		reconciliationTime:      reconciliationTime,
+		managementCluster:       managementCluster,
 	}, nil
 }
@@ -240,15 +240,15 @@ func (c *ControlPlane) GetKubeadmConfig(machineName string) (*bootstrapv1.Kubead
 }
 
 // MachinesNeedingRollout return a list of machines that need to be rolled out.
-func (c *ControlPlane) MachinesNeedingRollout() (collections.Machines, map[string]NotUpToDateResult) {
+func (c *ControlPlane) MachinesNeedingRollout() (collections.Machines, map[string]UpToDateResult) {
 	// Note: Machines already deleted are dropped because they will be replaced by new machines after deletion completes.
-	return c.MachinesNotUpToDate.Filter(collections.Not(collections.HasDeletionTimestamp)), c.machinesNotUpToDateResults
+	return c.MachinesNotUpToDate.Filter(collections.Not(collections.HasDeletionTimestamp)), c.machinesUpToDateResults
 }
 
 // NotUpToDateMachines return a list of machines that are not up to date with the control
 // plane's configuration.
-func (c *ControlPlane) NotUpToDateMachines() (collections.Machines, map[string]NotUpToDateResult) {
-	return c.MachinesNotUpToDate, c.machinesNotUpToDateResults
+func (c *ControlPlane) NotUpToDateMachines() (collections.Machines, map[string]UpToDateResult) {
+	return c.MachinesNotUpToDate, c.machinesUpToDateResults
 }
 
 // UpToDateMachines returns the machines that are up to date with the control
diff --git a/controlplane/kubeadm/internal/control_plane_test.go b/controlplane/kubeadm/internal/control_plane_test.go
index ea4cc09a93cd..c28c6000fabb 100644
--- a/controlplane/kubeadm/internal/control_plane_test.go
+++ b/controlplane/kubeadm/internal/control_plane_test.go
@@ -122,19 +122,19 @@ func TestControlPlane(t *testing.T) {
 	g.Expect(controlPlane.Machines).To(HaveLen(5))
 
-	machinesNotUptoDate, machinesNotUpToDateResults := controlPlane.NotUpToDateMachines()
+	machinesNotUptoDate, machinesUpToDateResults := controlPlane.NotUpToDateMachines()
 	g.Expect(machinesNotUptoDate.Names()).To(ConsistOf("m2", "m3"))
-	// machinesNotUpToDateResults contains results for all Machines (including up-to-date Machines).
-	g.Expect(machinesNotUpToDateResults).To(HaveLen(5))
-	g.Expect(machinesNotUpToDateResults["m2"].ConditionMessages).To(Equal([]string{"Version v1.29.0, v1.31.0 required"}))
-	g.Expect(machinesNotUpToDateResults["m3"].ConditionMessages).To(Equal([]string{"Version v1.29.3, v1.31.0 required"}))
+	// machinesUpToDateResults contains results for all Machines (including up-to-date Machines).
+	g.Expect(machinesUpToDateResults).To(HaveLen(5))
+	g.Expect(machinesUpToDateResults["m2"].ConditionMessages).To(Equal([]string{"Version v1.29.0, v1.31.0 required"}))
+	g.Expect(machinesUpToDateResults["m3"].ConditionMessages).To(Equal([]string{"Version v1.29.3, v1.31.0 required"}))
 
-	machinesNeedingRollout, machinesNotUpToDateResults := controlPlane.MachinesNeedingRollout()
+	machinesNeedingRollout, machinesUpToDateResults := controlPlane.MachinesNeedingRollout()
 	g.Expect(machinesNeedingRollout.Names()).To(ConsistOf("m2"))
-	// machinesNotUpToDateResults contains results for all Machines (including up-to-date Machines).
-	g.Expect(machinesNotUpToDateResults).To(HaveLen(5))
-	g.Expect(machinesNotUpToDateResults["m2"].LogMessages).To(Equal([]string{"Machine version \"v1.29.0\" is not equal to KCP version \"v1.31.0\""}))
-	g.Expect(machinesNotUpToDateResults["m3"].LogMessages).To(Equal([]string{"Machine version \"v1.29.3\" is not equal to KCP version \"v1.31.0\""}))
+	// machinesUpToDateResults contains results for all Machines (including up-to-date Machines).
+	g.Expect(machinesUpToDateResults).To(HaveLen(5))
+	g.Expect(machinesUpToDateResults["m2"].LogMessages).To(Equal([]string{"Machine version \"v1.29.0\" is not equal to KCP version \"v1.31.0\""}))
+	g.Expect(machinesUpToDateResults["m3"].LogMessages).To(Equal([]string{"Machine version \"v1.29.3\" is not equal to KCP version \"v1.31.0\""}))
 
 	upToDateMachines := controlPlane.UpToDateMachines()
 	g.Expect(upToDateMachines).To(HaveLen(3))
diff --git a/controlplane/kubeadm/internal/controllers/controller.go b/controlplane/kubeadm/internal/controllers/controller.go
index 1bcda359a032..32735bec542b 100644
--- a/controlplane/kubeadm/internal/controllers/controller.go
+++ b/controlplane/kubeadm/internal/controllers/controller.go
@@ -95,6 +95,11 @@ type KubeadmControlPlaneReconciler struct {
 	managementCluster         internal.ManagementCluster
 	managementClusterUncached internal.ManagementCluster
 	ssaCache                  ssa.Cache
+
+	// Only used for testing
+	overrideTryInPlaceUpdateFunc      func(ctx context.Context, controlPlane *internal.ControlPlane, machineToInPlaceUpdate *clusterv1.Machine, machineUpToDateResult internal.UpToDateResult) (bool, ctrl.Result, error)
+	overrideScaleUpControlPlaneFunc   func(ctx context.Context, controlPlane *internal.ControlPlane) (ctrl.Result, error)
+	overrideScaleDownControlPlaneFunc func(ctx context.Context, controlPlane *internal.ControlPlane, machineToDelete *clusterv1.Machine) (ctrl.Result, error)
 }
 
 func (r *KubeadmControlPlaneReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options controller.Options) error {
@@ -467,16 +472,16 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, controlPl
 	}
 
 	// Control plane machines rollout due to configuration changes (e.g. upgrades) takes precedence over other operations.
-	machinesNeedingRollout, machinesNeedingRolloutResults := controlPlane.MachinesNeedingRollout()
+	machinesNeedingRollout, machinesUpToDateResults := controlPlane.MachinesNeedingRollout()
 	switch {
 	case len(machinesNeedingRollout) > 0:
 		var allMessages []string
-		for machine, machinesNeedingRolloutResult := range machinesNeedingRolloutResults {
-			allMessages = append(allMessages, fmt.Sprintf("Machine %s needs rollout: %s", machine, strings.Join(machinesNeedingRolloutResult.LogMessages, ",")))
+		for machine, machineUpToDateResult := range machinesUpToDateResults {
+			allMessages = append(allMessages, fmt.Sprintf("Machine %s needs rollout: %s", machine, strings.Join(machineUpToDateResult.LogMessages, ",")))
 		}
 		log.Info(fmt.Sprintf("Rolling out Control Plane machines: %s", strings.Join(allMessages, ",")), "machinesNeedingRollout", machinesNeedingRollout.Names())
 		v1beta1conditions.MarkFalse(controlPlane.KCP, controlplanev1.MachinesSpecUpToDateV1Beta1Condition, controlplanev1.RollingUpdateInProgressV1Beta1Reason, clusterv1.ConditionSeverityWarning, "Rolling %d replicas with outdated spec (%d replicas up to date)", len(machinesNeedingRollout), len(controlPlane.Machines)-len(machinesNeedingRollout))
-		return r.upgradeControlPlane(ctx, controlPlane, machinesNeedingRollout)
+		return r.updateControlPlane(ctx, controlPlane, machinesNeedingRollout, machinesUpToDateResults)
 	default:
 		// make sure last upgrade operation is marked as completed.
 		// NOTE: we are checking the condition already exists in order to avoid to set this condition at the first
@@ -506,7 +511,12 @@ func (r *KubeadmControlPlaneReconciler) reconcile(ctx context.Context, controlPl
 	case numMachines > desiredReplicas:
 		log.Info("Scaling down control plane", "desired", desiredReplicas, "existing", numMachines)
 		// The last parameter (i.e. machines needing to be rolled out) should always be empty here.
-		return r.scaleDownControlPlane(ctx, controlPlane, collections.Machines{})
+		// Pick the Machine that we should scale down.
+		machineToDelete, err := selectMachineForInPlaceUpdateOrScaleDown(ctx, controlPlane, collections.Machines{})
+		if err != nil {
+			return ctrl.Result{}, errors.Wrap(err, "failed to select machine for scale down")
+		}
+		return r.scaleDownControlPlane(ctx, controlPlane, machineToDelete)
 	}
 
 	// Get the workload cluster client.
@@ -975,16 +985,16 @@ func (r *KubeadmControlPlaneReconciler) reconcileControlPlaneAndMachinesConditio
 }
 
 func reconcileMachineUpToDateCondition(_ context.Context, controlPlane *internal.ControlPlane) {
-	machinesNotUptoDate, machinesNotUpToDateResults := controlPlane.NotUpToDateMachines()
+	machinesNotUptoDate, machinesUpToDateResults := controlPlane.NotUpToDateMachines()
 	machinesNotUptoDateNames := sets.New(machinesNotUptoDate.Names()...)
 
 	for _, machine := range controlPlane.Machines {
 		if machinesNotUptoDateNames.Has(machine.Name) {
 			// Note: the code computing the message for KCP's RolloutOut condition is making assumptions on the format/content of this message.
 			message := ""
-			if machinesNotUpToDateResult, ok := machinesNotUpToDateResults[machine.Name]; ok && len(machinesNotUpToDateResult.ConditionMessages) > 0 {
+			if machineUpToDateResult, ok := machinesUpToDateResults[machine.Name]; ok && len(machineUpToDateResult.ConditionMessages) > 0 {
 				var reasons []string
-				for _, conditionMessage := range machinesNotUpToDateResult.ConditionMessages {
+				for _, conditionMessage := range machineUpToDateResult.ConditionMessages {
 					reasons = append(reasons, fmt.Sprintf("* %s", conditionMessage))
 				}
 				message = strings.Join(reasons, "\n")
diff --git a/controlplane/kubeadm/internal/controllers/inplace.go b/controlplane/kubeadm/internal/controllers/inplace.go
new file mode 100644
index 000000000000..3e67f66ea7ac
--- /dev/null
+++ b/controlplane/kubeadm/internal/controllers/inplace.go
@@ -0,0 +1,40 @@
+/*
+Copyright 2025 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package controllers
+
+import (
+	"context"
+
+	ctrl "sigs.k8s.io/controller-runtime"
+
+	clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2"
+	"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal"
+)
+
+func (r *KubeadmControlPlaneReconciler) tryInPlaceUpdate(
+	ctx context.Context,
+	controlPlane *internal.ControlPlane,
+	machineToInPlaceUpdate *clusterv1.Machine,
+	machineUpToDateResult internal.UpToDateResult,
+) (fallbackToScaleDown bool, _ ctrl.Result, _ error) {
+	if r.overrideTryInPlaceUpdateFunc != nil {
+		return r.overrideTryInPlaceUpdateFunc(ctx, controlPlane, machineToInPlaceUpdate, machineUpToDateResult)
+	}
+
+	// Always fallback to scale down until in-place is implemented.
+	return true, ctrl.Result{}, nil
+}
diff --git a/controlplane/kubeadm/internal/controllers/scale.go b/controlplane/kubeadm/internal/controllers/scale.go
index e153086fc923..671d9de75ea3 100644
--- a/controlplane/kubeadm/internal/controllers/scale.go
+++ b/controlplane/kubeadm/internal/controllers/scale.go
@@ -63,6 +63,10 @@ func (r *KubeadmControlPlaneReconciler) initializeControlPlane(ctx context.Conte
 }
 
 func (r *KubeadmControlPlaneReconciler) scaleUpControlPlane(ctx context.Context, controlPlane *internal.ControlPlane) (ctrl.Result, error) {
+	if r.overrideScaleUpControlPlaneFunc != nil {
+		return r.overrideScaleUpControlPlaneFunc(ctx, controlPlane)
+	}
+
 	log := ctrl.LoggerFrom(ctx)
 
 	// Run preflight checks to ensure that the control plane is stable before proceeding with a scale up/scale down operation; if not, wait.
@@ -95,16 +99,14 @@ func (r *KubeadmControlPlaneReconciler) scaleUpControlPlane(ctx context.Context,
 func (r *KubeadmControlPlaneReconciler) scaleDownControlPlane(
 	ctx context.Context,
 	controlPlane *internal.ControlPlane,
-	outdatedMachines collections.Machines,
+	machineToDelete *clusterv1.Machine,
 ) (ctrl.Result, error) {
-	log := ctrl.LoggerFrom(ctx)
-
-	// Pick the Machine that we should scale down.
-	machineToDelete, err := selectMachineForScaleDown(ctx, controlPlane, outdatedMachines)
-	if err != nil {
-		return ctrl.Result{}, errors.Wrap(err, "failed to select machine for scale down")
+	if r.overrideScaleDownControlPlaneFunc != nil {
+		return r.overrideScaleDownControlPlaneFunc(ctx, controlPlane, machineToDelete)
 	}
 
+	log := ctrl.LoggerFrom(ctx)
+
 	// Run preflight checks ensuring the control plane is stable before proceeding with a scale up/scale down operation; if not, wait.
 	// Given that we're scaling down, we can exclude the machineToDelete from the preflight checks.
 	if result, err := r.preflightChecks(ctx, controlPlane, machineToDelete); err != nil || !result.IsZero() {
@@ -265,7 +267,8 @@ func preflightCheckCondition(kind string, obj *clusterv1.Machine, conditionType
 	return nil
 }
 
-// selectMachineForScaleDown select a machine candidate for scaling down. The selection is a two phase process:
+// selectMachineForInPlaceUpdateOrScaleDown selects a machine candidate for scaling down or for in-place update.
+// The selection is a two-phase process:
 //
 // In the first phase it selects a subset of machines eligible for deletion:
 // - if there are outdated machines with the delete machine annotation, use them as eligible subset (priority to user requests, part 1)
@@ -276,18 +279,20 @@ func preflightCheckCondition(kind string, obj *clusterv1.Machine, conditionType
 //
 // Once the subset of machines eligible for deletion is identified, one machine is picked out of this subset by
 // selecting the machine in the failure domain with most machines (including both eligible and not eligible machines).
-func selectMachineForScaleDown(ctx context.Context, controlPlane *internal.ControlPlane, outdatedMachines collections.Machines) (*clusterv1.Machine, error) {
+func selectMachineForInPlaceUpdateOrScaleDown(ctx context.Context, controlPlane *internal.ControlPlane, outdatedMachines collections.Machines) (*clusterv1.Machine, error) {
 	// Select the subset of machines eligible for scale down.
-	eligibleMachines := controlPlane.Machines
+	var eligibleMachines collections.Machines
 	switch {
 	case controlPlane.MachineWithDeleteAnnotation(outdatedMachines).Len() > 0:
 		eligibleMachines = controlPlane.MachineWithDeleteAnnotation(outdatedMachines)
-	case controlPlane.MachineWithDeleteAnnotation(eligibleMachines).Len() > 0:
-		eligibleMachines = controlPlane.MachineWithDeleteAnnotation(eligibleMachines)
+	case controlPlane.MachineWithDeleteAnnotation(controlPlane.Machines).Len() > 0:
+		eligibleMachines = controlPlane.MachineWithDeleteAnnotation(controlPlane.Machines)
 	case controlPlane.UnhealthyMachinesWithUnhealthyControlPlaneComponents(outdatedMachines).Len() > 0:
 		eligibleMachines = controlPlane.UnhealthyMachinesWithUnhealthyControlPlaneComponents(outdatedMachines)
 	case outdatedMachines.Len() > 0:
 		eligibleMachines = outdatedMachines
+	default:
+		eligibleMachines = controlPlane.Machines
 	}
 
 	// Pick an eligible machine from the failure domain with most machines in (including both eligible and not eligible machines)
diff --git a/controlplane/kubeadm/internal/controllers/scale_test.go b/controlplane/kubeadm/internal/controllers/scale_test.go
index 7650de735267..728a6bb6ae1a 100644
--- a/controlplane/kubeadm/internal/controllers/scale_test.go
+++ b/controlplane/kubeadm/internal/controllers/scale_test.go
@@ -284,7 +284,9 @@ func TestKubeadmControlPlaneReconciler_scaleDownControlPlane_NoError(t *testing.
 	}
 	controlPlane.InjectTestManagementCluster(r.managementCluster)
 
-	result, err := r.scaleDownControlPlane(context.Background(), controlPlane, controlPlane.Machines)
+	machineToDelete, err := selectMachineForInPlaceUpdateOrScaleDown(ctx, controlPlane, controlPlane.Machines)
+	g.Expect(err).ToNot(HaveOccurred())
+	result, err := r.scaleDownControlPlane(context.Background(), controlPlane, machineToDelete)
 	g.Expect(err).ToNot(HaveOccurred())
 	g.Expect(result).To(BeComparableTo(ctrl.Result{Requeue: true}))
 
@@ -326,7 +328,9 @@ func TestKubeadmControlPlaneReconciler_scaleDownControlPlane_NoError(t *testing.
 	}
 	controlPlane.InjectTestManagementCluster(r.managementCluster)
 
-	result, err := r.scaleDownControlPlane(context.Background(), controlPlane, controlPlane.Machines)
+	machineToDelete, err := selectMachineForInPlaceUpdateOrScaleDown(ctx, controlPlane, controlPlane.Machines)
+	g.Expect(err).ToNot(HaveOccurred())
+	result, err := r.scaleDownControlPlane(context.Background(), controlPlane, machineToDelete)
 	g.Expect(err).ToNot(HaveOccurred())
 	g.Expect(result).To(BeComparableTo(ctrl.Result{Requeue: true}))
 
@@ -364,7 +368,9 @@ func TestKubeadmControlPlaneReconciler_scaleDownControlPlane_NoError(t *testing.
 	}
 	controlPlane.InjectTestManagementCluster(r.managementCluster)
 
-	result, err := r.scaleDownControlPlane(context.Background(), controlPlane, controlPlane.Machines)
+	machineToDelete, err := selectMachineForInPlaceUpdateOrScaleDown(ctx, controlPlane, controlPlane.Machines)
+	g.Expect(err).ToNot(HaveOccurred())
+	result, err := r.scaleDownControlPlane(context.Background(), controlPlane, machineToDelete)
 	g.Expect(err).ToNot(HaveOccurred())
 	g.Expect(result).To(BeComparableTo(ctrl.Result{RequeueAfter: preflightFailedRequeueAfter}))
 
@@ -374,7 +380,7 @@ func TestKubeadmControlPlaneReconciler_scaleDownControlPlane_NoError(t *testing.
 	})
 }
 
-func TestSelectMachineForScaleDown(t *testing.T) {
+func TestSelectMachineForInPlaceUpdateOrScaleDown(t *testing.T) {
 	kcp := controlplanev1.KubeadmControlPlane{
 		Spec: controlplanev1.KubeadmControlPlaneSpec{},
 	}
@@ -503,7 +509,7 @@ func TestSelectMachineForScaleDown(t *testing.T) {
 		t.Run(tc.name, func(t *testing.T) {
 			g := NewWithT(t)
 
-			selectedMachine, err := selectMachineForScaleDown(ctx, tc.cp, tc.outDatedMachines)
+			selectedMachine, err := selectMachineForInPlaceUpdateOrScaleDown(ctx, tc.cp, tc.outDatedMachines)
 
 			if tc.expectErr {
 				g.Expect(err).To(HaveOccurred())
diff --git a/controlplane/kubeadm/internal/controllers/upgrade.go b/controlplane/kubeadm/internal/controllers/upgrade.go
index 7768c03877b8..a85076fddb8a 100644
--- a/controlplane/kubeadm/internal/controllers/upgrade.go
+++ b/controlplane/kubeadm/internal/controllers/upgrade.go
@@ -27,13 +27,15 @@ import (
 	bootstrapv1 "sigs.k8s.io/cluster-api/api/bootstrap/kubeadm/v1beta2"
 	controlplanev1 "sigs.k8s.io/cluster-api/api/controlplane/kubeadm/v1beta2"
 	"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal"
+	"sigs.k8s.io/cluster-api/feature"
 	"sigs.k8s.io/cluster-api/util/collections"
 )
 
-func (r *KubeadmControlPlaneReconciler) upgradeControlPlane(
+func (r *KubeadmControlPlaneReconciler) updateControlPlane(
 	ctx context.Context,
 	controlPlane *internal.ControlPlane,
-	machinesRequireUpgrade collections.Machines,
+	machinesNeedingRollout collections.Machines,
+	machinesUpToDateResults map[string]internal.UpToDateResult,
 ) (ctrl.Result, error) {
 	log := ctrl.LoggerFrom(ctx)
 
@@ -42,17 +44,17 @@ func (r *KubeadmControlPlaneReconciler) upgradeControlPlane(
 	workloadCluster, err := controlPlane.GetWorkloadCluster(ctx)
 	if err != nil {
 		log.Error(err, "failed to get remote client for workload cluster", "Cluster", klog.KObj(controlPlane.Cluster))
-		return ctrl.Result{}, err
+		return ctrl.Result{}, errors.Wrapf(err, "failed to update control plane")
 	}
 
 	parsedVersion, err := semver.ParseTolerant(controlPlane.KCP.Spec.Version)
 	if err != nil {
-		return ctrl.Result{}, errors.Wrapf(err, "failed to parse kubernetes version %q", controlPlane.KCP.Spec.Version)
+		return ctrl.Result{}, errors.Wrapf(err, "failed to update control plane: failed to parse Kubernetes version %q", controlPlane.KCP.Spec.Version)
 	}
 
 	// Ensure kubeadm clusterRoleBinding for v1.29+ as per https://github.com/kubernetes/kubernetes/pull/121305
 	if err := workloadCluster.AllowClusterAdminPermissions(ctx, parsedVersion); err != nil {
-		return ctrl.Result{}, errors.Wrap(err, "failed to set cluster-admin ClusterRoleBinding for kubeadm")
+		return ctrl.Result{}, errors.Wrap(err, "failed to update control plane: failed to set cluster-admin ClusterRoleBinding for kubeadm")
 	}
 
 	kubeadmCMMutators := make([]func(*bootstrapv1.ClusterConfiguration), 0)
@@ -81,21 +83,76 @@ func (r *KubeadmControlPlaneReconciler) upgradeControlPlane(
 	// collectively update Kubeadm config map
 	if err = workloadCluster.UpdateClusterConfiguration(ctx, parsedVersion, kubeadmCMMutators...); err != nil {
-		return ctrl.Result{}, err
+		return ctrl.Result{}, errors.Wrapf(err, "failed to update control plane")
 	}
 
 	switch controlPlane.KCP.Spec.Rollout.Strategy.Type {
 	case controlplanev1.RollingUpdateStrategyType:
 		// RolloutStrategy is currently defaulted and validated to be RollingUpdate
-		// We can ignore MaxUnavailable because we are enforcing health checks before we get here.
-		maxNodes := *controlPlane.KCP.Spec.Replicas + int32(controlPlane.KCP.Spec.Rollout.Strategy.RollingUpdate.MaxSurge.IntValue())
-		if int32(controlPlane.Machines.Len()) < maxNodes {
-			// scaleUp ensures that we don't continue scaling up while waiting for Machines to have NodeRefs
-			return r.scaleUpControlPlane(ctx, controlPlane)
+		res, err := r.rollingUpdate(ctx, controlPlane, machinesNeedingRollout, machinesUpToDateResults)
+		if err != nil {
+			return ctrl.Result{}, errors.Wrapf(err, "failed to update control plane")
 		}
-		return r.scaleDownControlPlane(ctx, controlPlane, machinesRequireUpgrade)
+		return res, nil
 	default:
 		log.Info("RolloutStrategy type is not set to RollingUpdate, unable to determine the strategy for rolling out machines")
 		return ctrl.Result{}, nil
 	}
 }
+
+func (r *KubeadmControlPlaneReconciler) rollingUpdate(
+	ctx context.Context,
+	controlPlane *internal.ControlPlane,
+	machinesNeedingRollout collections.Machines,
+	machinesUpToDateResults map[string]internal.UpToDateResult,
+) (ctrl.Result, error) {
+	currentReplicas := int32(controlPlane.Machines.Len())
+	currentUpToDateReplicas := int32(controlPlane.UpToDateMachines().Len())
+	desiredReplicas := *controlPlane.KCP.Spec.Replicas
+	maxSurge := int32(controlPlane.KCP.Spec.Rollout.Strategy.RollingUpdate.MaxSurge.IntValue())
+	// Note: As MaxSurge is validated to be either 0 or 1, maxReplicas will be either desiredReplicas or desiredReplicas+1.
+	maxReplicas := desiredReplicas + maxSurge
+
+	// If currentReplicas < maxReplicas we have to scale up.
+	// Note: This is done to ensure we have as many Machines as allowed during rollout to maximize fault tolerance.
+	if currentReplicas < maxReplicas {
+		// Note: scaleUpControlPlane ensures that we don't continue scaling up while waiting for Machines to have NodeRefs.
+		return r.scaleUpControlPlane(ctx, controlPlane)
+	}
+
+	// If currentReplicas >= maxReplicas we have to scale down.
+	// Note: If we are already at or above the maximum number of Machines we have to in-place update or delete a Machine
+	// to make progress with the update (as we cannot create additional new Machines above the maximum).
+
+	// Pick the Machine that we should in-place update or scale down.
+	machineToInPlaceUpdateOrScaleDown, err := selectMachineForInPlaceUpdateOrScaleDown(ctx, controlPlane, machinesNeedingRollout)
+	if err != nil {
+		return ctrl.Result{}, errors.Wrap(err, "failed to select next Machine for rollout")
+	}
+	machineUpToDateResult, ok := machinesUpToDateResults[machineToInPlaceUpdateOrScaleDown.Name]
+	if !ok {
+		// Note: This should never happen as we store results for all Machines in machinesUpToDateResults.
+		return ctrl.Result{}, errors.Errorf("failed to check if Machine %s is UpToDate", machineToInPlaceUpdateOrScaleDown.Name)
+	}
+
+	// If the selected Machine is eligible for in-place update and we don't already have enough up-to-date replicas, try in-place update.
+	// Note: To be safe we only try an in-place update when we would otherwise delete a Machine. This ensures we can
+	// afford it if the in-place update fails and the Machine becomes unavailable (and eventually MHC kicks in and the Machine is recreated).
+	if feature.Gates.Enabled(feature.InPlaceUpdates) &&
+		machineUpToDateResult.EligibleForInPlaceUpdate &&
+		currentUpToDateReplicas < desiredReplicas {
+		fallbackToScaleDown, res, err := r.tryInPlaceUpdate(ctx, controlPlane, machineToInPlaceUpdateOrScaleDown, machineUpToDateResult)
+		if err != nil {
+			return ctrl.Result{}, err
+		}
+		if !res.IsZero() {
+			return res, nil
+		}
+		if fallbackToScaleDown {
+			return r.scaleDownControlPlane(ctx, controlPlane, machineToInPlaceUpdateOrScaleDown)
+		}
+		// In-place update triggered
+		return ctrl.Result{}, nil // Note: Requeue is not needed, changes to Machines trigger another reconcile.
+	}
+	return r.scaleDownControlPlane(ctx, controlPlane, machineToInPlaceUpdateOrScaleDown)
+}
diff --git a/controlplane/kubeadm/internal/controllers/upgrade_test.go b/controlplane/kubeadm/internal/controllers/upgrade_test.go
index 4ad4440be042..08d981e375fc 100644
--- a/controlplane/kubeadm/internal/controllers/upgrade_test.go
+++ b/controlplane/kubeadm/internal/controllers/upgrade_test.go
@@ -23,18 +23,23 @@ import (
 	"time"
 
 	. "github.com/onsi/gomega"
+	"github.com/pkg/errors"
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/apimachinery/pkg/util/intstr"
 	"k8s.io/client-go/tools/record"
+	utilfeature "k8s.io/component-base/featuregate/testing"
 	"k8s.io/utils/ptr"
 	ctrl "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 
 	bootstrapv1 "sigs.k8s.io/cluster-api/api/bootstrap/kubeadm/v1beta2"
+	controlplanev1 "sigs.k8s.io/cluster-api/api/controlplane/kubeadm/v1beta2"
 	clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2"
 	"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal"
 	"sigs.k8s.io/cluster-api/controlplane/kubeadm/internal/desiredstate"
+	"sigs.k8s.io/cluster-api/feature"
 	"sigs.k8s.io/cluster-api/internal/util/ssa"
 	"sigs.k8s.io/cluster-api/util"
 	"sigs.k8s.io/cluster-api/util/collections"
@@ -113,7 +118,7 @@ func TestKubeadmControlPlaneReconciler_RolloutStrategy_ScaleUp(t *testing.T) {
 	initialMachine := &clusterv1.MachineList{}
 	g.Eventually(func(g Gomega) {
 		// Nb. This Eventually block also forces the cache to update so that subsequent
-		// reconcile and upgradeControlPlane calls use the updated cache and avoids flakiness in the test.
+		// reconcile and updateControlPlane calls use the updated cache and avoids flakiness in the test.
 		g.Expect(env.List(ctx, initialMachine, client.InNamespace(cluster.Namespace))).To(Succeed())
 		g.Expect(initialMachine.Items).To(HaveLen(1))
 	}, timeout).Should(Succeed())
@@ -127,7 +132,11 @@ func TestKubeadmControlPlaneReconciler_RolloutStrategy_ScaleUp(t *testing.T) {
 	// run upgrade the first time, expect we scale up
 	needingUpgrade := collections.FromMachineList(initialMachine)
 	controlPlane.Machines = needingUpgrade
-	result, err = r.upgradeControlPlane(ctx, controlPlane, needingUpgrade)
+	machinesUpToDateResults := map[string]internal.UpToDateResult{}
+	for _, m := range needingUpgrade {
+		machinesUpToDateResults[m.Name] = internal.UpToDateResult{EligibleForInPlaceUpdate: false}
+	}
+	result, err = r.updateControlPlane(ctx, controlPlane, needingUpgrade, machinesUpToDateResults)
 	g.Expect(result).To(BeComparableTo(ctrl.Result{Requeue: true}))
 	g.Expect(err).ToNot(HaveOccurred())
 	bothMachines := &clusterv1.MachineList{}
@@ -167,9 +176,13 @@ func TestKubeadmControlPlaneReconciler_RolloutStrategy_ScaleUp(t *testing.T) {
 			machinesRequireUpgrade[bothMachines.Items[i].Name] = &bothMachines.Items[i]
 		}
 	}
+	machinesUpToDateResults = map[string]internal.UpToDateResult{}
+	for _, m := range machinesRequireUpgrade {
+		machinesUpToDateResults[m.Name] = internal.UpToDateResult{EligibleForInPlaceUpdate: false}
+	}
 
 	// run upgrade the second time, expect we scale down
-	result, err = r.upgradeControlPlane(ctx, controlPlane, machinesRequireUpgrade)
+	result, err = r.updateControlPlane(ctx, controlPlane, machinesRequireUpgrade, machinesUpToDateResults)
 	g.Expect(err).ToNot(HaveOccurred())
 	g.Expect(result).To(BeComparableTo(ctrl.Result{Requeue: true}))
 	finalMachine := &clusterv1.MachineList{}
@@ -260,8 +273,11 @@ func TestKubeadmControlPlaneReconciler_RolloutStrategy_ScaleDown(t *testing.T) {
 	// run upgrade, expect we scale down
 	needingUpgrade := collections.FromMachineList(machineList)
 	controlPlane.Machines = needingUpgrade
-
-	result, err = r.upgradeControlPlane(ctx, controlPlane, needingUpgrade)
+	machinesUpToDateResults := map[string]internal.UpToDateResult{}
+	for _, m := range needingUpgrade {
+		machinesUpToDateResults[m.Name] = internal.UpToDateResult{EligibleForInPlaceUpdate: false}
+	}
+	result, err = r.updateControlPlane(ctx, controlPlane, needingUpgrade, machinesUpToDateResults)
 	g.Expect(result).To(BeComparableTo(ctrl.Result{Requeue: true}))
 	g.Expect(err).ToNot(HaveOccurred())
 	remainingMachines := &clusterv1.MachineList{}
@@ -269,6 +285,227 @@ func TestKubeadmControlPlaneReconciler_RolloutStrategy_ScaleDown(t *testing.T) {
 	g.Expect(remainingMachines.Items).To(HaveLen(2))
 }
 
+func Test_rollingUpdate(t *testing.T) {
+	tests := []struct {
+		name                            string
+		maxSurge                        int32
+		currentReplicas                 int32
+		currentUpToDateReplicas         int32
+		desiredReplicas                 int32
+		enableInPlaceUpdatesFeatureGate bool
+		machineEligibleForInPlaceUpdate bool
+		tryInPlaceUpdateFunc            func(ctx context.Context, controlPlane *internal.ControlPlane, machineToInPlaceUpdate *clusterv1.Machine, machineUpToDateResult internal.UpToDateResult) (bool, ctrl.Result, error)
+		wantTryInPlaceUpdateCalled      bool
+		wantScaleDownCalled             bool
+		wantScaleUpCalled               bool
+		wantError                       bool
+		wantErrorMessage                string
+		wantRes                         ctrl.Result
+	}{
+		// Regular rollout (no in-place updates)
+		{
+			name: "Regular rollout: maxSurge 1: scale up",
+			maxSurge: 1,
+			currentReplicas: 3,
+			currentUpToDateReplicas: 0,
+			desiredReplicas: 3,
+			wantScaleUpCalled: true,
+		},
+		{
+			name: "Regular rollout: maxSurge 1: scale down",
+			maxSurge: 1,
+			currentReplicas: 4,
+			currentUpToDateReplicas: 1,
+			desiredReplicas: 3,
+			wantScaleDownCalled: true,
+		},
+		{
+			name: "Regular rollout: maxSurge 0: scale down",
+			maxSurge: 0,
+			currentReplicas: 3,
+			currentUpToDateReplicas: 0,
+			desiredReplicas: 3,
+			wantScaleDownCalled: true,
+		},
+		{
+			name: "Regular rollout: maxSurge 0: scale up",
+			maxSurge: 0,
+			currentReplicas: 2,
+			currentUpToDateReplicas: 0,
+			desiredReplicas: 3,
+			wantScaleUpCalled: true,
+		},
+		// In-place updates
+		// Note: maxSurge 0 or 1 doesn't have an impact on the in-place code path so not testing permutations here.
+		// Note: Scale up works the same way as for regular rollouts so not testing it here again.
+		//
+		// In-place updates: tryInPlaceUpdate not called
+		{
+			name: "In-place updates: feature gate disabled: scale down (tryInPlaceUpdate not called)",
+			maxSurge: 0,
+			currentReplicas: 3,
+			currentUpToDateReplicas: 0,
+			desiredReplicas: 3,
+			enableInPlaceUpdatesFeatureGate: false,
+			wantTryInPlaceUpdateCalled: false,
+			wantScaleDownCalled: true,
+		},
+		{
+			name: "In-place updates: Machine not eligible for in-place: scale down (tryInPlaceUpdate not called)",
+			maxSurge: 0,
+			currentReplicas: 3,
+			currentUpToDateReplicas: 0,
+			desiredReplicas: 3,
+			enableInPlaceUpdatesFeatureGate: true,
+			machineEligibleForInPlaceUpdate: false,
+			wantTryInPlaceUpdateCalled: false,
+			wantScaleDownCalled: true,
+		},
+		{
+			name: "In-place updates: already enough up-to-date replicas: scale down (tryInPlaceUpdate not called)",
+			maxSurge: 1,
+			currentReplicas: 4,
+			currentUpToDateReplicas: 3,
+			desiredReplicas: 3,
+			enableInPlaceUpdatesFeatureGate: true,
+			machineEligibleForInPlaceUpdate: true,
+			wantTryInPlaceUpdateCalled: false,
+			wantScaleDownCalled: true,
+		},
+		// In-place updates: tryInPlaceUpdate called
+		{
+			name: "In-place updates: tryInPlaceUpdate returns error",
+			maxSurge: 0,
+			currentReplicas: 3,
+			currentUpToDateReplicas: 0,
+			desiredReplicas: 3,
+			enableInPlaceUpdatesFeatureGate: true,
+			machineEligibleForInPlaceUpdate: true,
+			tryInPlaceUpdateFunc: func(_ context.Context, _ *internal.ControlPlane, _ *clusterv1.Machine, _ internal.UpToDateResult) (bool, ctrl.Result, error) {
+				return false, ctrl.Result{}, errors.New("in-place update error")
+			},
+			wantTryInPlaceUpdateCalled: true,
+			wantScaleDownCalled: false,
+			wantError: true,
+			wantErrorMessage: "in-place update error",
+		},
+		{
+			name: "In-place updates: tryInPlaceUpdate returns Requeue",
+			maxSurge: 0,
+			currentReplicas: 3,
+			currentUpToDateReplicas: 0,
+			desiredReplicas: 3,
+			enableInPlaceUpdatesFeatureGate: true,
+			machineEligibleForInPlaceUpdate: true,
+			tryInPlaceUpdateFunc: func(_ context.Context, _ *internal.ControlPlane, _ *clusterv1.Machine, _ internal.UpToDateResult) (fallbackToScaleDown bool, _ ctrl.Result, _ error) {
+				return false, ctrl.Result{RequeueAfter: preflightFailedRequeueAfter}, nil
+			},
+			wantTryInPlaceUpdateCalled: true,
+			wantScaleDownCalled: false,
+			wantRes: ctrl.Result{RequeueAfter: preflightFailedRequeueAfter},
+		},
+		{
+			name: "In-place updates: tryInPlaceUpdate returns fallback to scale down",
+			maxSurge: 0,
+			currentReplicas: 3,
+			currentUpToDateReplicas: 0,
+			desiredReplicas: 3,
+			enableInPlaceUpdatesFeatureGate: true,
+			machineEligibleForInPlaceUpdate: true,
+			tryInPlaceUpdateFunc: func(_ context.Context, _ *internal.ControlPlane, _ *clusterv1.Machine, _ internal.UpToDateResult) (fallbackToScaleDown bool, _ ctrl.Result, _ error) {
+				return true, ctrl.Result{}, nil
+			},
+			wantTryInPlaceUpdateCalled: true,
+			wantScaleDownCalled: true,
+		},
+		{
+			name: "In-place updates: tryInPlaceUpdate returns nothing (in-place update triggered)",
+			maxSurge: 0,
+			currentReplicas: 3,
+			currentUpToDateReplicas: 0,
+			desiredReplicas: 3,
+			enableInPlaceUpdatesFeatureGate: true,
+			machineEligibleForInPlaceUpdate: true,
+			tryInPlaceUpdateFunc: func(_ context.Context, _ *internal.ControlPlane, _ *clusterv1.Machine, _ internal.UpToDateResult) (fallbackToScaleDown bool, _ ctrl.Result, _ error) {
+				return false, ctrl.Result{}, nil
+			},
+			wantTryInPlaceUpdateCalled: true,
+			wantScaleDownCalled: false,
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			g := NewWithT(t)
+
+			if tt.enableInPlaceUpdatesFeatureGate {
+				utilfeature.SetFeatureGateDuringTest(t, feature.Gates, feature.InPlaceUpdates, true)
+			}
+
+			var inPlaceUpdateCalled bool
+			var scaleDownCalled bool
+			var scaleUpCalled bool
+			r := &KubeadmControlPlaneReconciler{
+				overrideTryInPlaceUpdateFunc: func(ctx context.Context, controlPlane *internal.ControlPlane, machineToInPlaceUpdate *clusterv1.Machine, machineUpToDateResult internal.UpToDateResult) (bool, ctrl.Result, error) {
+					inPlaceUpdateCalled = true
+					return tt.tryInPlaceUpdateFunc(ctx, controlPlane, machineToInPlaceUpdate, machineUpToDateResult)
+				},
+				overrideScaleDownControlPlaneFunc: func(_ context.Context, _ *internal.ControlPlane, _ *clusterv1.Machine) (ctrl.Result, error) {
+					scaleDownCalled = true
+					return ctrl.Result{}, nil
+				},
+				overrideScaleUpControlPlaneFunc: func(_ context.Context, _ *internal.ControlPlane) (ctrl.Result, error) {
+					scaleUpCalled = true
+					return ctrl.Result{}, nil
+				},
+			}
+
+			machines := collections.Machines{}
+			for i := range tt.currentReplicas {
+				machines[fmt.Sprintf("machine-%d", i)] = machine(fmt.Sprintf("machine-%d", i))
+			}
+			machinesUpToDate := collections.Machines{}
+			for i := range tt.currentUpToDateReplicas {
+				machinesUpToDate[fmt.Sprintf("machine-%d", i)] = machine(fmt.Sprintf("machine-%d", i))
+			}
+
+			controlPlane := &internal.ControlPlane{
+				KCP: &controlplanev1.KubeadmControlPlane{
+					Spec: controlplanev1.KubeadmControlPlaneSpec{
+						Replicas: ptr.To(tt.desiredReplicas),
+						Rollout: controlplanev1.KubeadmControlPlaneRolloutSpec{
+							Strategy: controlplanev1.KubeadmControlPlaneRolloutStrategy{
+								RollingUpdate: controlplanev1.KubeadmControlPlaneRolloutStrategyRollingUpdate{
+									MaxSurge: ptr.To(intstr.FromInt32(tt.maxSurge)),
+								},
+							},
+						},
+					},
+				},
+				Cluster: &clusterv1.Cluster{},
+				Machines: machines,
+				MachinesNotUpToDate: machines.Difference(machinesUpToDate),
+			}
+			machinesNeedingRollout, _ := controlPlane.MachinesNeedingRollout()
+			machinesUpToDateResults := map[string]internal.UpToDateResult{}
+			for _, m := range machinesNeedingRollout {
+				machinesUpToDateResults[m.Name] = internal.UpToDateResult{EligibleForInPlaceUpdate: tt.machineEligibleForInPlaceUpdate}
+			}
+			res, err := r.rollingUpdate(ctx, controlPlane, machinesNeedingRollout, machinesUpToDateResults)
+			if tt.wantError {
+				g.Expect(err).To(HaveOccurred())
+				g.Expect(err.Error()).To(Equal(tt.wantErrorMessage))
+			} else {
+				g.Expect(err).ToNot(HaveOccurred())
+			}
+			g.Expect(res).To(Equal(tt.wantRes))
+
+			g.Expect(inPlaceUpdateCalled).To(Equal(tt.wantTryInPlaceUpdateCalled), "inPlaceUpdateCalled: actual: %t expected: %t", inPlaceUpdateCalled, tt.wantTryInPlaceUpdateCalled)
+			g.Expect(scaleDownCalled).To(Equal(tt.wantScaleDownCalled), "scaleDownCalled: actual: %t expected: %t", scaleDownCalled, tt.wantScaleDownCalled)
+			g.Expect(scaleUpCalled).To(Equal(tt.wantScaleUpCalled), "scaleUpCalled: actual: %t expected: %t", scaleUpCalled, tt.wantScaleUpCalled)
+		})
+	}
+}
+
 type machineOpt func(*clusterv1.Machine)
 
 func machine(name string, opts ...machineOpt) *clusterv1.Machine {
diff --git a/controlplane/kubeadm/internal/filters.go b/controlplane/kubeadm/internal/filters.go
index 042a5c59123d..a0e986c9359d 100644
--- a/controlplane/kubeadm/internal/filters.go
+++ b/controlplane/kubeadm/internal/filters.go
@@ -35,8 +35,8 @@ import (
 	"sigs.k8s.io/cluster-api/util/collections"
 )
 
-// NotUpToDateResult is the result of calling the UpToDate func for a Machine.
-type NotUpToDateResult struct {
+// UpToDateResult is the result of calling the UpToDate func for a Machine.
+type UpToDateResult struct {
 	LogMessages              []string
 	ConditionMessages        []string
 	EligibleForInPlaceUpdate bool
@@ -58,13 +58,18 @@ func UpToDate(
 	reconciliationTime *metav1.Time,
 	infraMachines map[string]*unstructured.Unstructured,
 	kubeadmConfigs map[string]*bootstrapv1.KubeadmConfig,
-) (bool, *NotUpToDateResult, error) {
-	res := &NotUpToDateResult{
+) (bool, *UpToDateResult, error) {
+	res := &UpToDateResult{
 		// A Machine is eligible for in-place update except if we find a reason why it shouldn't be,
 		// e.g. rollout.before, rollout.after or the Machine is already up-to-date.
 		EligibleForInPlaceUpdate: true,
 	}
 
+	// If a Machine is marked for deletion it is not eligible for in-place update.
+	if _, ok := machine.Annotations[clusterv1.DeleteMachineAnnotation]; ok {
+		res.EligibleForInPlaceUpdate = false
+	}
+
 	// Machines whose certificates are about to expire.
 	if collections.ShouldRolloutBefore(reconciliationTime, kcp.Spec.Rollout.Before)(machine) {
 		res.LogMessages = append(res.LogMessages, "certificates will expire soon, rolloutBefore expired")
@@ -116,7 +121,7 @@ func matchesMachineSpec(
 	kcp *controlplanev1.KubeadmControlPlane,
 	cluster *clusterv1.Cluster,
 	machine *clusterv1.Machine,
-	res *NotUpToDateResult,
+	res *UpToDateResult,
 ) (bool, []string, []string, error) {
 	logMessages := []string{}
 	conditionMessages := []string{}
diff --git a/controlplane/kubeadm/internal/filters_test.go b/controlplane/kubeadm/internal/filters_test.go
index 81dd15a85947..1251b3697720 100644
--- a/controlplane/kubeadm/internal/filters_test.go
+++ b/controlplane/kubeadm/internal/filters_test.go
@@ -1604,7 +1604,11 @@ func TestUpToDate(t *testing.T) {
 				kcp.Spec.Version = "v1.30.2"
 				return kcp
 			}(),
-			machine: defaultMachine, // defaultMachine has "v1.31.0"
+			machine: func() *clusterv1.Machine {
+				machine := defaultMachine.DeepCopy()
+				machine.Spec.Version = "v1.30.0"
+				return machine
+			}(),
 			infraConfigs: defaultInfraConfigs,
 			machineConfigs: defaultMachineConfigs,
 			expectUptoDate: false,
@@ -1612,6 +1616,28 @@ func TestUpToDate(t *testing.T) {
 			expectLogMessages: []string{"Machine version \"v1.30.0\" is not equal to KCP version \"v1.30.2\""},
 			expectConditionMessages: []string{"Version v1.30.0, v1.30.2 required"},
 		},
+		{
+			name: "kubernetes version does not match + delete annotation",
+			kcp: func() *controlplanev1.KubeadmControlPlane {
+				kcp := defaultKcp.DeepCopy()
+				kcp.Spec.Version = "v1.30.2"
+				return kcp
+			}(),
+			machine: func() *clusterv1.Machine {
+				machine := defaultMachine.DeepCopy()
+				machine.Spec.Version = "v1.30.0"
+				machine.Annotations = map[string]string{
+					clusterv1.DeleteMachineAnnotation: "",
+				}
+				return machine
+			}(),
+			infraConfigs: defaultInfraConfigs,
+			machineConfigs: defaultMachineConfigs,
+			expectUptoDate: false,
+			expectEligibleForInPlaceUpdate: false, // Not eligible for in-place update because of delete annotation.
+			expectLogMessages: []string{"Machine version \"v1.30.0\" is not equal to KCP version \"v1.30.2\""},
+			expectConditionMessages: []string{"Version v1.30.0, v1.30.2 required"},
+		},
 		{
 			name: "KubeadmConfig is not up-to-date",
 			kcp: func() *controlplanev1.KubeadmControlPlane {