From 9ed24822d68bd26bd24d017a71862f212ecf2a85 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Micka=C3=ABl=20Le=20Baillif?= <1567582+demikl@users.noreply.github.com>
Date: Thu, 4 Sep 2025 17:41:55 +0200
Subject: [PATCH 1/2] Fix nil pointer panic and spurious Auto Mode updates

- Add isAutoModeCluster() function to detect valid Auto Mode configurations
- Add validateAutoModeConfig() to enforce AWS requirement that compute, storage, and load balancing must all be enabled/disabled together
- Only call updateComputeConfig() for actual Auto Mode clusters
- Ignore elasticLoadBalancing absent vs false diffs for non-Auto Mode clusters

Fixes aws-controllers-k8s/community#2619
---
 pkg/resource/cluster/hook.go | 104 ++++++++++++++++++++++++++++++-----
 1 file changed, 90 insertions(+), 14 deletions(-)

diff --git a/pkg/resource/cluster/hook.go b/pkg/resource/cluster/hook.go
index a0b647d..910bbfb 100644
--- a/pkg/resource/cluster/hook.go
+++ b/pkg/resource/cluster/hook.go
@@ -184,6 +184,72 @@ func (rm *resourceManager) clusterInUse(ctx context.Context, r *resource) (bool,
 	return (nodes != nil && len(nodes.Nodegroups) > 0), nil
 }
 
+// isAutoModeCluster returns true if the cluster is configured for EKS Auto Mode.
+// According to AWS documentation, compute, block storage, and load balancing capabilities
+// must all be enabled or disabled together. Any partial configuration is invalid.
+func isAutoModeCluster(r *resource) bool {
+	if r == nil || r.ko == nil {
+		return false
+	}
+
+	// If ComputeConfig is not specified, this is not an Auto Mode cluster
+	if r.ko.Spec.ComputeConfig == nil {
+		return false
+	}
+
+	// Check compute configuration
+	computeEnabled := r.ko.Spec.ComputeConfig.Enabled != nil && *r.ko.Spec.ComputeConfig.Enabled
+
+	// Check storage configuration
+	storageEnabled := false
+	if r.ko.Spec.StorageConfig != nil && r.ko.Spec.StorageConfig.BlockStorage != nil {
+		storageEnabled = r.ko.Spec.StorageConfig.BlockStorage.Enabled != nil && *r.ko.Spec.StorageConfig.BlockStorage.Enabled
+	}
+
+	// Check elastic load balancing configuration
+	elbEnabled := false
+	if r.ko.Spec.KubernetesNetworkConfig != nil && r.ko.Spec.KubernetesNetworkConfig.ElasticLoadBalancing != nil {
+		elbEnabled = r.ko.Spec.KubernetesNetworkConfig.ElasticLoadBalancing.Enabled != nil && *r.ko.Spec.KubernetesNetworkConfig.ElasticLoadBalancing.Enabled
+	}
+
+	// Auto Mode requires all three capabilities to have the same state (all true or all false)
+	// If they are all true, Auto Mode is enabled
+	// If they are all false, Auto Mode is being disabled
+	// Any other combination is invalid
+	return (computeEnabled && storageEnabled && elbEnabled) || (!computeEnabled && !storageEnabled && !elbEnabled)
+}
+
+// validateAutoModeConfig validates that Auto Mode configuration is consistent.
+// Returns an error if the configuration is invalid (partial enablement).
+func validateAutoModeConfig(r *resource) error {
+	if r == nil || r.ko == nil || r.ko.Spec.ComputeConfig == nil {
+		return nil // Not an Auto Mode configuration
+	}
+
+	// Check compute configuration
+	computeEnabled := r.ko.Spec.ComputeConfig.Enabled != nil && *r.ko.Spec.ComputeConfig.Enabled
+
+	// Check storage configuration
+	storageEnabled := false
+	if r.ko.Spec.StorageConfig != nil && r.ko.Spec.StorageConfig.BlockStorage != nil {
+		storageEnabled = r.ko.Spec.StorageConfig.BlockStorage.Enabled != nil && *r.ko.Spec.StorageConfig.BlockStorage.Enabled
+	}
+
+	// Check elastic load balancing configuration
+	elbEnabled := false
+	if r.ko.Spec.KubernetesNetworkConfig != nil && r.ko.Spec.KubernetesNetworkConfig.ElasticLoadBalancing != nil {
+		elbEnabled = r.ko.Spec.KubernetesNetworkConfig.ElasticLoadBalancing.Enabled != nil && *r.ko.Spec.KubernetesNetworkConfig.ElasticLoadBalancing.Enabled
+	}
+
+	// All three must be in the same state
+	if computeEnabled == storageEnabled && storageEnabled == elbEnabled {
+		return nil // Valid configuration
+	}
+
+	return fmt.Errorf("invalid Auto Mode configuration: compute, block storage, and load balancing capabilities must all be enabled or disabled together (compute=%v, storage=%v, elb=%v)",
+		computeEnabled, storageEnabled, elbEnabled)
+}
+
 func customPreCompare(
 	a *resource,
 	b *resource,
@@ -380,25 +446,35 @@ func (rm *resourceManager) customUpdate(
 		return returnClusterUpdating(updatedRes)
 	}
 
-	// Handle computeConfig updates
+	// Handle computeConfig updates - only for Auto Mode clusters
 	if delta.DifferentAt("Spec.ComputeConfig") || delta.DifferentAt("Spec.StorageConfig") ||
 		delta.DifferentAt("Spec.KubernetesNetworkConfig") {
-		if err := rm.updateComputeConfig(ctx, desired); err != nil {
-			awsErr, ok := extractAWSError(err)
-			rlog.Info("attempting to update AutoMode config",
-				"error", err,
-				"isAWSError", ok,
-				"awsErrorCode", awsErr.Code)
+		// Validate Auto Mode configuration consistency before attempting update
+		if err := validateAutoModeConfig(desired); err != nil {
+			return nil, ackerr.NewTerminalError(err)
+		}
 
-			// Check to see if we've raced an async update call and need to requeue
-			if ok && awsErr.Code == "ResourceInUseException" {
-				rlog.Info("resource in use, requeueing after async update")
-				return nil, requeueAfterAsyncUpdate()
+		// Only proceed with Auto Mode updates if the cluster is actually configured for Auto Mode
+		if isAutoModeCluster(desired) {
+			if err := rm.updateComputeConfig(ctx, desired); err != nil {
+				awsErr, ok := extractAWSError(err)
+				rlog.Info("attempting to update AutoMode config",
+					"error", err,
+					"isAWSError", ok,
+					"awsErrorCode", awsErr.Code)
+
+				// Check to see if we've raced an async update call and need to requeue
+				if ok && awsErr.Code == "ResourceInUseException" {
+					rlog.Info("resource in use, requeueing after async update")
+					return nil, requeueAfterAsyncUpdate()
+				}
+
+				return nil, fmt.Errorf("failed to update AutoMode config: %w", err)
 			}
-
-			return nil, fmt.Errorf("failed to update AutoMode config: %w", err)
+			return returnClusterUpdating(updatedRes)
 		}
-
-		return returnClusterUpdating(updatedRes)
+		// If not Auto Mode, ignore the diff (likely elasticLoadBalancing false vs absent)
+		rlog.Debug("ignoring diff on compute/storage/network config for non-Auto Mode cluster")
 	}
 
 	// Handle zonalShiftConfig updates

From 3b5856f1ab79db5194bb5a00c449441b73246e05 Mon Sep 17 00:00:00 2001
From: Arush Sharma
Date: Fri, 3 Oct 2025 11:31:53 -0700
Subject: [PATCH 2/2] fix-automode-nil-pointer-issue

---
 pkg/resource/cluster/hook.go            |  90 +++++-------
 test/e2e/tests/test_cluster_automode.py | 175 +++++++++++++++++++++++-
 2 files changed, 203 insertions(+), 62 deletions(-)

diff --git a/pkg/resource/cluster/hook.go b/pkg/resource/cluster/hook.go
index 910bbfb..943d188 100644
--- a/pkg/resource/cluster/hook.go
+++ b/pkg/resource/cluster/hook.go
@@ -187,69 +187,42 @@ func (rm *resourceManager) clusterInUse(ctx context.Context, r *resource) (bool,
 // isAutoModeCluster returns true if the cluster is configured for EKS Auto Mode.
 // According to AWS documentation, compute, block storage, and load balancing capabilities
 // must all be enabled or disabled together. Any partial configuration is invalid.
-func isAutoModeCluster(r *resource) bool {
+// Returns an error for invalid configurations.
+func isAutoModeCluster(r *resource) (bool, error) {
 	if r == nil || r.ko == nil {
-		return false
+		return false, nil
 	}
 
-	// If ComputeConfig is not specified, this is not an Auto Mode cluster
-	if r.ko.Spec.ComputeConfig == nil {
-		return false
-	}
-
-	// Check compute configuration
-	computeEnabled := r.ko.Spec.ComputeConfig.Enabled != nil && *r.ko.Spec.ComputeConfig.Enabled
+	hasComputeConfig := r.ko.Spec.ComputeConfig != nil
+	hasStorageConfig := r.ko.Spec.StorageConfig != nil && r.ko.Spec.StorageConfig.BlockStorage != nil
+	hasELBConfig := r.ko.Spec.KubernetesNetworkConfig != nil && r.ko.Spec.KubernetesNetworkConfig.ElasticLoadBalancing != nil
 
-	// Check storage configuration
-	storageEnabled := false
-	if r.ko.Spec.StorageConfig != nil && r.ko.Spec.StorageConfig.BlockStorage != nil {
-		storageEnabled = r.ko.Spec.StorageConfig.BlockStorage.Enabled != nil && *r.ko.Spec.StorageConfig.BlockStorage.Enabled
+	// If no Auto Mode configuration is present, it's valid (not an Auto Mode cluster)
+	if !hasComputeConfig && !hasStorageConfig && !hasELBConfig {
+		return false, nil
 	}
 
-	// Check elastic load balancing configuration
-	elbEnabled := false
-	if r.ko.Spec.KubernetesNetworkConfig != nil && r.ko.Spec.KubernetesNetworkConfig.ElasticLoadBalancing != nil {
-		elbEnabled = r.ko.Spec.KubernetesNetworkConfig.ElasticLoadBalancing.Enabled != nil && *r.ko.Spec.KubernetesNetworkConfig.ElasticLoadBalancing.Enabled
-	}
-
-	// Auto Mode requires all three capabilities to have the same state (all true or all false)
-	// If they are all true, Auto Mode is enabled
-	// If they are all false, Auto Mode is being disabled
-	// Any other combination is invalid
-	return (computeEnabled && storageEnabled && elbEnabled) || (!computeEnabled && !storageEnabled && !elbEnabled)
-}
-
-// validateAutoModeConfig validates that Auto Mode configuration is consistent.
-// Returns an error if the configuration is invalid (partial enablement).
-func validateAutoModeConfig(r *resource) error {
-	if r == nil || r.ko == nil || r.ko.Spec.ComputeConfig == nil {
-		return nil // Not an Auto Mode configuration
+	// If any Auto Mode configuration is present, ALL must be present
+	if !hasComputeConfig || !hasStorageConfig || !hasELBConfig {
+		return false, fmt.Errorf("invalid Auto Mode configuration: when configuring Auto Mode, all three capabilities must be specified (compute=%v, storage=%v, elb=%v)",
+			hasComputeConfig, hasStorageConfig, hasELBConfig)
 	}
 
-	// Check compute configuration
 	computeEnabled := r.ko.Spec.ComputeConfig.Enabled != nil && *r.ko.Spec.ComputeConfig.Enabled
-
-	// Check storage configuration
-	storageEnabled := false
-	if r.ko.Spec.StorageConfig != nil && r.ko.Spec.StorageConfig.BlockStorage != nil {
-		storageEnabled = r.ko.Spec.StorageConfig.BlockStorage.Enabled != nil && *r.ko.Spec.StorageConfig.BlockStorage.Enabled
-	}
-
-	// Check elastic load balancing configuration
-	elbEnabled := false
-	if r.ko.Spec.KubernetesNetworkConfig != nil && r.ko.Spec.KubernetesNetworkConfig.ElasticLoadBalancing != nil {
-		elbEnabled = r.ko.Spec.KubernetesNetworkConfig.ElasticLoadBalancing.Enabled != nil && *r.ko.Spec.KubernetesNetworkConfig.ElasticLoadBalancing.Enabled
-	}
+	storageEnabled := r.ko.Spec.StorageConfig.BlockStorage.Enabled != nil && *r.ko.Spec.StorageConfig.BlockStorage.Enabled
+	elbEnabled := r.ko.Spec.KubernetesNetworkConfig.ElasticLoadBalancing.Enabled != nil && *r.ko.Spec.KubernetesNetworkConfig.ElasticLoadBalancing.Enabled
 
 	// All three must be in the same state
-	if computeEnabled == storageEnabled && storageEnabled == elbEnabled {
-		return nil // Valid configuration
+	if computeEnabled != storageEnabled || storageEnabled != elbEnabled {
+		return false, fmt.Errorf("invalid Auto Mode configuration: compute, block storage, and load balancing capabilities must all be enabled or disabled together (compute=%v, storage=%v, elb=%v)",
+			computeEnabled, storageEnabled, elbEnabled)
 	}
 
-	return fmt.Errorf("invalid Auto Mode configuration: compute, block storage, and load balancing capabilities must all be enabled or disabled together (compute=%v, storage=%v, elb=%v)",
-		computeEnabled, storageEnabled, elbEnabled)
+	isAutoMode := (computeEnabled && storageEnabled && elbEnabled) || (!computeEnabled && !storageEnabled && !elbEnabled)
+	return isAutoMode, nil
 }
+
 func customPreCompare(
 	a *resource,
 	b *resource,
@@ -448,22 +421,25 @@ func (rm *resourceManager) customUpdate(
 	// Handle computeConfig updates - only for Auto Mode clusters
 	if delta.DifferentAt("Spec.ComputeConfig") || delta.DifferentAt("Spec.StorageConfig") ||
 		delta.DifferentAt("Spec.KubernetesNetworkConfig") {
-		// Validate Auto Mode configuration consistency before attempting update
-		if err := validateAutoModeConfig(desired); err != nil {
+		// Validate Auto Mode configuration and proceed only if cluster is configured for Auto Mode
+		isAutoMode, err := isAutoModeCluster(desired)
+		if err != nil {
 			return nil, ackerr.NewTerminalError(err)
 		}
-
-		// Only proceed with Auto Mode updates if the cluster is actually configured for Auto Mode
-		if isAutoModeCluster(desired) {
+		if isAutoMode {
 			if err := rm.updateComputeConfig(ctx, desired); err != nil {
 				awsErr, ok := extractAWSError(err)
+				var awsErrorCode string
+				if ok && awsErr != nil {
+					awsErrorCode = awsErr.Code
+				}
 				rlog.Info("attempting to update AutoMode config",
 					"error", err,
 					"isAWSError", ok,
-					"awsErrorCode", awsErr.Code)
+					"awsErrorCode", awsErrorCode)
 
 				// Check to see if we've raced an async update call and need to requeue
-				if ok && awsErr.Code == "ResourceInUseException" {
+				if ok && awsErr != nil && awsErr.Code == "ResourceInUseException" {
 					rlog.Info("resource in use, requeueing after async update")
 					return nil, requeueAfterAsyncUpdate()
 				}
@@ -473,8 +449,8 @@ func (rm *resourceManager) customUpdate(
 
 			return returnClusterUpdating(updatedRes)
 		}
-		// If not Auto Mode, ignore the diff (likely elasticLoadBalancing false vs absent)
-		rlog.Debug("ignoring diff on compute/storage/network config for non-Auto Mode cluster")
+		// If not Auto Mode, ignore the diff
+		rlog.Info("ignoring diff on compute/storage/network config for non-Auto Mode cluster")
 	}
 
 	// Handle zonalShiftConfig updates
diff --git a/test/e2e/tests/test_cluster_automode.py b/test/e2e/tests/test_cluster_automode.py
index 831d664..5a077f9 100644
--- a/test/e2e/tests/test_cluster_automode.py
+++ b/test/e2e/tests/test_cluster_automode.py
@@ -18,6 +18,7 @@ import logging
 import time
 
 import pytest
+import json
 
 from acktest.k8s import resource as k8s
 from acktest.k8s import condition
@@ -33,9 +34,10 @@ from e2e.common.types import CLUSTER_RESOURCE_PLURAL
 from e2e.common.waiter import wait_until_deleted
 from e2e.replacement_values import REPLACEMENT_VALUES
+from e2e.tests.test_cluster import simple_cluster
 
-MODIFY_WAIT_AFTER_SECONDS = 240
-CHECK_STATUS_WAIT_SECONDS = 240
+MODIFY_WAIT_AFTER_SECONDS = 60
+CHECK_STATUS_WAIT_SECONDS = 30
 
 
 def wait_for_cluster_active(eks_client, cluster_name):
@@ -93,8 +95,13 @@ def auto_mode_cluster(eks_client):
 
     yield (ref, cr)
 
-    pass
-
+    # Try to delete, in case the resource still exists
+    try:
+        _, deleted = k8s.delete_custom_resource(ref, 9, 10)
+        assert deleted
+        wait_until_deleted(cluster_name)
+    except Exception:
+        pass
 
 @service_marker
 @pytest.mark.canary
@@ -141,6 +148,164 @@ def test_create_auto_mode_cluster(self, eks_client, auto_mode_cluster):
         time.sleep(CHECK_STATUS_WAIT_SECONDS)
 
         # Clean up
-        _, deleted = k8s.delete_custom_resource(ref, 3, 10)
+        _, deleted = k8s.delete_custom_resource(ref, 9, 10)
         assert deleted
         wait_until_deleted(cluster_name)
+
+
+@service_marker
+@pytest.mark.canary
+class TestAutoModeClusterUpdates:
+    def test_enable_auto_mode_on_standard_cluster(self, eks_client, simple_cluster):
+        (ref, cr) = simple_cluster
+        cluster_name = cr["spec"]["name"]
+
+        aws_res = eks_client.describe_cluster(name=cluster_name)
+        assert aws_res is not None
+
+        # Wait for the cluster to be ACTIVE and let controller refresh status
+        wait_for_cluster_active(eks_client, cluster_name)
+        time.sleep(CHECK_STATUS_WAIT_SECONDS)
+        get_and_assert_status(ref, "ACTIVE", True)
+
+        # Patch to enable auto-mode
+        patch_enable_auto_mode = {
+            "spec": {
+                "computeConfig": {"enabled": True},
+                "storageConfig": {"blockStorage": {"enabled": True}},
+                "kubernetesNetworkConfig": {
+                    "elasticLoadBalancing": {"enabled": True},
+                    "ipFamily": "ipv4",
+                },
+            }
+        }
+        k8s.patch_custom_resource(ref, patch_enable_auto_mode)
+        time.sleep(MODIFY_WAIT_AFTER_SECONDS)
+        get_and_assert_status(ref, "UPDATING", False)
+
+        # Wait for cluster to become active after update
+        wait_for_cluster_active(eks_client, cluster_name)
+        time.sleep(CHECK_STATUS_WAIT_SECONDS)
+        get_and_assert_status(ref, "ACTIVE", True)
+
+        # Verify auto-mode activation via EKS update history (since DescribeCluster may not reflect the fields immediately)
+        updates_summary = eks_client.list_updates(name=cluster_name)
+
+        update_ids = updates_summary.get("updateIds", [])
+        assert len(update_ids) == 1, (
+            f"Expected exactly 1 update, got {len(update_ids)}: {update_ids}"
+        )
+
+        update_id = update_ids[0]
+        upd_desc = eks_client.describe_update(name=cluster_name, updateId=update_id)
+
+        update_info = upd_desc["update"]
+
+        # Verify update type and status
+        assert update_info["type"] == "AutoModeUpdate", (
+            f"Expected AutoModeUpdate, got: {update_info['type']}"
+        )
+        assert update_info["status"] == "Successful", (
+            f"Expected Successful status, got: {update_info['status']}"
+        )
+
+    def test_disable_auto_mode_incorrectly(self, eks_client, auto_mode_cluster):
+        (ref, cr) = auto_mode_cluster
+        cluster_name = cr["spec"]["name"]
+
+        try:
+            aws_res = eks_client.describe_cluster(name=cluster_name)
+            assert aws_res is not None
+        except eks_client.exceptions.ResourceNotFoundException:
+            pytest.fail(f"Could not find cluster '{cluster_name}' in EKS")
+
+        wait_for_cluster_active(eks_client, cluster_name)
+        time.sleep(CHECK_STATUS_WAIT_SECONDS)
+        get_and_assert_status(ref, "ACTIVE", True)
+
+        # Patch with incorrect parameters to disable auto-mode
+        patch_disable_auto_mode_incorrectly = {
+            "spec": {
+                "computeConfig": {"enabled": False},
+                "storageConfig": {
+                    "blockStorage": {
+                        "enabled": True  # Should be False
+                    }
+                },
+                "kubernetesNetworkConfig": {"elasticLoadBalancing": {"enabled": False}},
+            }
+        }
+
+        k8s.patch_custom_resource(ref, patch_disable_auto_mode_incorrectly)
+        time.sleep(MODIFY_WAIT_AFTER_SECONDS)
+
+        # The controller should detect the invalid configuration and set a terminal condition.
+        terminal_condition = "ACK.Terminal"
+        cond = k8s.get_resource_condition(ref, terminal_condition)
+        if cond is None:
+            pytest.fail(
+                f"Failed to find {terminal_condition} condition in resource {ref}"
+            )
+
+        cond_status = cond.get("status", None)
+        if str(cond_status) != str(True):
+            pytest.fail(
+                f"Expected {terminal_condition} condition to have status True but found {cond_status}"
+            )
+
+        # Verify the error message contains information about invalid Auto Mode configuration
+        assert "invalid Auto Mode configuration" in cond.get("message", "")
+
+    def test_disable_auto_mode_correctly(self, eks_client, auto_mode_cluster):
+        (ref, cr) = auto_mode_cluster
+        cluster_name = cr["spec"]["name"]
+
+        try:
+            aws_res = eks_client.describe_cluster(name=cluster_name)
+            assert aws_res is not None
+        except eks_client.exceptions.ResourceNotFoundException:
+            pytest.fail(f"Could not find cluster '{cluster_name}' in EKS")
+
+        wait_for_cluster_active(eks_client, cluster_name)
+        time.sleep(CHECK_STATUS_WAIT_SECONDS)
+        get_and_assert_status(ref, "ACTIVE", True)
+
+        # Patch to disable auto-mode correctly
+        patch_disable_auto_mode = {
+            "spec": {
+                "computeConfig": {"enabled": False},
+                "storageConfig": {"blockStorage": {"enabled": False}},
+                "kubernetesNetworkConfig": {"elasticLoadBalancing": {"enabled": False}},
+            }
+        }
+
+        k8s.patch_custom_resource(ref, patch_disable_auto_mode)
+        time.sleep(MODIFY_WAIT_AFTER_SECONDS)
+        get_and_assert_status(ref, "UPDATING", False)
+
+        wait_for_cluster_active(eks_client, cluster_name)
+        time.sleep(CHECK_STATUS_WAIT_SECONDS)
+        get_and_assert_status(ref, "ACTIVE", True)
+
+        # Verify auto-mode is disabled
+        aws_res = eks_client.describe_cluster(name=cluster_name)
+        compute_config = aws_res["cluster"].get("computeConfig")
+        if compute_config is not None:
+            assert compute_config.get("enabled") is False, (
+                f"computeConfig.enabled should be False or absent, got: {compute_config.get('enabled')}"
+            )
+
+        storage_config = aws_res["cluster"].get("storageConfig")
+        if storage_config is not None:
+            block_storage = storage_config.get("blockStorage", {})
+            if block_storage:
+                assert block_storage.get("enabled") is False, (
+                    f"storageConfig.blockStorage.enabled should be False or absent, got: {block_storage.get('enabled')}"
+                )
+
+        k8s_network_config = aws_res["cluster"].get("kubernetesNetworkConfig", {})
+        elb_config = k8s_network_config.get("elasticLoadBalancing")
+        if elb_config is not None:
+            assert elb_config.get("enabled") is False, (
+                f"kubernetesNetworkConfig.elasticLoadBalancing.enabled should be False or absent, got: {elb_config.get('enabled')}"
+            )
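
Note for reviewers (illustration only, not part of either patch): the standalone Go sketch below mirrors the all-or-nothing rule that the reworked isAutoModeCluster() from PATCH 2/2 enforces. The capability and clusterSpec types are simplified stand-ins invented for this example; the controller's real generated API fields (Spec.ComputeConfig, Spec.StorageConfig.BlockStorage, Spec.KubernetesNetworkConfig.ElasticLoadBalancing) are assumed to behave like these pointer fields but are not reproduced here.

// automode_rule_sketch.go - illustrative only, simplified stand-in types.
package main

import (
	"errors"
	"fmt"
)

// capability stands in for any of the three Auto Mode sub-configs,
// each of which carries an optional Enabled flag.
type capability struct {
	Enabled *bool
}

// clusterSpec stands in for the relevant slice of the Cluster spec.
type clusterSpec struct {
	ComputeConfig        *capability
	BlockStorage         *capability
	ElasticLoadBalancing *capability
}

// isAutoMode mirrors the patched isAutoModeCluster(): no Auto Mode fields at
// all means "not Auto Mode", a partial set of fields is an error, and when all
// three are present their Enabled values must agree (all true or all false).
func isAutoMode(s clusterSpec) (bool, error) {
	hasCompute := s.ComputeConfig != nil
	hasStorage := s.BlockStorage != nil
	hasELB := s.ElasticLoadBalancing != nil

	if !hasCompute && !hasStorage && !hasELB {
		return false, nil // plain cluster, nothing to validate
	}
	if !hasCompute || !hasStorage || !hasELB {
		return false, errors.New("partial Auto Mode configuration: specify compute, block storage, and load balancing together")
	}

	enabled := func(c *capability) bool { return c.Enabled != nil && *c.Enabled }
	compute, storage, elb := enabled(s.ComputeConfig), enabled(s.BlockStorage), enabled(s.ElasticLoadBalancing)
	if compute != storage || storage != elb {
		return false, fmt.Errorf("compute, block storage, and load balancing must all be enabled or disabled together (compute=%v, storage=%v, elb=%v)", compute, storage, elb)
	}
	return true, nil
}

func main() {
	t, f := true, false
	cases := []struct {
		name string
		spec clusterSpec
	}{
		{"standard cluster, no Auto Mode fields", clusterSpec{}},
		{"auto mode fully enabled", clusterSpec{ComputeConfig: &capability{&t}, BlockStorage: &capability{&t}, ElasticLoadBalancing: &capability{&t}}},
		{"auto mode fully disabled (disable request)", clusterSpec{ComputeConfig: &capability{&f}, BlockStorage: &capability{&f}, ElasticLoadBalancing: &capability{&f}}},
		{"invalid: storage left enabled", clusterSpec{ComputeConfig: &capability{&f}, BlockStorage: &capability{&t}, ElasticLoadBalancing: &capability{&f}}},
	}
	for _, c := range cases {
		ok, err := isAutoMode(c.spec)
		fmt.Printf("%-45s isAutoMode=%v err=%v\n", c.name, ok, err)
	}
}

Running the sketch prints one line per case. The fully disabled case still reports isAutoMode=true, which is what lets customUpdate() call updateComputeConfig() to turn Auto Mode off, while the partial case returns an error that the controller surfaces as a terminal condition (the behavior the test_disable_auto_mode_incorrectly e2e test asserts).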