diff --git a/pkg/resource/cluster/hook.go b/pkg/resource/cluster/hook.go
index a0b647d..943d188 100644
--- a/pkg/resource/cluster/hook.go
+++ b/pkg/resource/cluster/hook.go
@@ -184,6 +184,45 @@ func (rm *resourceManager) clusterInUse(ctx context.Context, r *resource) (bool, error) {
 	return (nodes != nil && len(nodes.Nodegroups) > 0), nil
 }
+
+// isAutoModeCluster reports whether the resource carries a complete EKS Auto
+// Mode configuration. Per AWS documentation, the compute, block storage, and
+// load balancing capabilities must all be enabled or disabled together; any
+// partial or mixed configuration is invalid and returns an error.
+func isAutoModeCluster(r *resource) (bool, error) {
+	if r == nil || r.ko == nil {
+		return false, nil
+	}
+
+	hasComputeConfig := r.ko.Spec.ComputeConfig != nil
+	hasStorageConfig := r.ko.Spec.StorageConfig != nil && r.ko.Spec.StorageConfig.BlockStorage != nil
+	hasELBConfig := r.ko.Spec.KubernetesNetworkConfig != nil && r.ko.Spec.KubernetesNetworkConfig.ElasticLoadBalancing != nil
+
+	// No Auto Mode configuration at all is valid: this is simply not an Auto Mode cluster.
+	if !hasComputeConfig && !hasStorageConfig && !hasELBConfig {
+		return false, nil
+	}
+
+	// If any Auto Mode configuration is present, ALL of it must be present.
+	if !hasComputeConfig || !hasStorageConfig || !hasELBConfig {
+		return false, fmt.Errorf("invalid Auto Mode configuration: when configuring Auto Mode, all three capabilities must be specified (compute=%v, storage=%v, elb=%v)",
+			hasComputeConfig, hasStorageConfig, hasELBConfig)
+	}
+
+	computeEnabled := r.ko.Spec.ComputeConfig.Enabled != nil && *r.ko.Spec.ComputeConfig.Enabled
+	storageEnabled := r.ko.Spec.StorageConfig.BlockStorage.Enabled != nil && *r.ko.Spec.StorageConfig.BlockStorage.Enabled
+	elbEnabled := r.ko.Spec.KubernetesNetworkConfig.ElasticLoadBalancing.Enabled != nil && *r.ko.Spec.KubernetesNetworkConfig.ElasticLoadBalancing.Enabled
+
+	// All three flags must be in the same state.
+	if computeEnabled != storageEnabled || storageEnabled != elbEnabled {
+		return false, fmt.Errorf("invalid Auto Mode configuration: compute, block storage, and load balancing capabilities must all be enabled or disabled together (compute=%v, storage=%v, elb=%v)",
+			computeEnabled, storageEnabled, elbEnabled)
+	}
+
+	// The flags agree, so the configuration is complete and consistent,
+	// whether Auto Mode is being enabled or disabled.
+	return true, nil
+}
 
 func customPreCompare(
 	a *resource,
 	b *resource,
@@ -380,25 +419,38 @@ func (rm *resourceManager) customUpdate(
 		return returnClusterUpdating(updatedRes)
 	}
-	// Handle computeConfig updates
+	// Handle computeConfig updates (Auto Mode clusters only)
 	if delta.DifferentAt("Spec.ComputeConfig") ||
 		delta.DifferentAt("Spec.StorageConfig") ||
 		delta.DifferentAt("Spec.KubernetesNetworkConfig") {
-		if err := rm.updateComputeConfig(ctx, desired); err != nil {
-			awsErr, ok := extractAWSError(err)
-			rlog.Info("attempting to update AutoMode config",
-				"error", err,
-				"isAWSError", ok,
-				"awsErrorCode", awsErr.Code)
-
-			// Check to see if we've raced an async update call and need to requeue
-			if ok && awsErr.Code == "ResourceInUseException" {
-				rlog.Info("resource in use, requeueing after async update")
-				return nil, requeueAfterAsyncUpdate()
+		// Validate the Auto Mode configuration and proceed only if the cluster is configured for Auto Mode.
+		isAutoMode, err := isAutoModeCluster(desired)
+		if err != nil {
+			return nil, ackerr.NewTerminalError(err)
+		}
+		if isAutoMode {
+			if err := rm.updateComputeConfig(ctx, desired); err != nil {
+				awsErr, ok := extractAWSError(err)
+				var awsErrorCode string
+				if ok && awsErr != nil {
+					awsErrorCode = awsErr.Code
+				}
+				rlog.Info("attempting to update AutoMode config",
+					"error", err,
+					"isAWSError", ok,
+					"awsErrorCode", awsErrorCode)
+
+				// Check whether we've raced an async update call and need to requeue.
+				if ok && awsErr != nil && awsErr.Code == "ResourceInUseException" {
+					rlog.Info("resource in use, requeueing after async update")
+					return nil, requeueAfterAsyncUpdate()
+				}
+
+				return nil, fmt.Errorf("failed to update AutoMode config: %w", err)
 			}
-			return nil, fmt.Errorf("failed to update AutoMode config: %w", err)
+			return returnClusterUpdating(updatedRes)
 		}
-
-		return returnClusterUpdating(updatedRes)
+		// Not an Auto Mode cluster: ignore the diff on these fields.
+		rlog.Info("ignoring diff on compute/storage/network config for non-Auto Mode cluster")
 	}
 
 	// Handle zonalShiftConfig updates
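Reviewer note: the validation truth table in `isAutoModeCluster` is easy to regress, so a table-driven unit test next to the hook may be worth adding. Below is a minimal sketch, not part of this PR: the generated spec type names (`ComputeConfigRequest`, `StorageConfigRequest`, `BlockStorage`, `KubernetesNetworkConfigRequest`, `ElasticLoadBalancing`) and the `resource{ko: ...}` layout are assumptions about the controller's generated API, and `clusterWith` is a hypothetical helper.

```go
package cluster

import (
	"testing"

	svcapitypes "github.com/aws-controllers-k8s/eks-controller/apis/v1alpha1"
)

// clusterWith builds a resource with the three Auto Mode capability flags;
// a nil pointer leaves that capability block unset entirely.
func clusterWith(compute, storage, elb *bool) *resource {
	ko := &svcapitypes.Cluster{}
	if compute != nil {
		// Type names here are assumed from the field paths used in hook.go.
		ko.Spec.ComputeConfig = &svcapitypes.ComputeConfigRequest{Enabled: compute}
	}
	if storage != nil {
		ko.Spec.StorageConfig = &svcapitypes.StorageConfigRequest{
			BlockStorage: &svcapitypes.BlockStorage{Enabled: storage},
		}
	}
	if elb != nil {
		ko.Spec.KubernetesNetworkConfig = &svcapitypes.KubernetesNetworkConfigRequest{
			ElasticLoadBalancing: &svcapitypes.ElasticLoadBalancing{Enabled: elb},
		}
	}
	return &resource{ko: ko}
}

func TestIsAutoModeCluster(t *testing.T) {
	on, off := new(bool), new(bool)
	*on = true

	cases := []struct {
		name    string
		r       *resource
		want    bool
		wantErr bool
	}{
		{"nil resource", nil, false, false},
		{"no auto mode config", clusterWith(nil, nil, nil), false, false},
		{"partial config is an error", clusterWith(on, nil, nil), false, true},
		{"mixed flags are an error", clusterWith(on, off, on), false, true},
		{"all enabled", clusterWith(on, on, on), true, false},
		{"all disabled is still a complete config", clusterWith(off, off, off), true, false},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got, err := isAutoModeCluster(tc.r)
			if (err != nil) != tc.wantErr {
				t.Fatalf("error = %v, wantErr %v", err, tc.wantErr)
			}
			if got != tc.want {
				t.Fatalf("got %v, want %v", got, tc.want)
			}
		})
	}
}
```

The "all disabled" case documents an intentional choice: a complete, consistent opt-out still returns true so that customUpdate pushes the disable through updateComputeConfig.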
diff --git a/test/e2e/tests/test_cluster_automode.py b/test/e2e/tests/test_cluster_automode.py
index 831d664..5a077f9 100644
--- a/test/e2e/tests/test_cluster_automode.py
+++ b/test/e2e/tests/test_cluster_automode.py
@@ -33,9 +33,10 @@
 from e2e.common.types import CLUSTER_RESOURCE_PLURAL
 from e2e.common.waiter import wait_until_deleted
 from e2e.replacement_values import REPLACEMENT_VALUES
+from e2e.tests.test_cluster import simple_cluster
 
-MODIFY_WAIT_AFTER_SECONDS = 240
-CHECK_STATUS_WAIT_SECONDS = 240
+MODIFY_WAIT_AFTER_SECONDS = 60
+CHECK_STATUS_WAIT_SECONDS = 30
 
 
 def wait_for_cluster_active(eks_client, cluster_name):
@@ -93,8 +94,13 @@ def auto_mode_cluster(eks_client):
 
     yield (ref, cr)
 
-    pass
-
+    # Best-effort cleanup in case the test did not delete the cluster itself.
+    try:
+        _, deleted = k8s.delete_custom_resource(ref, 9, 10)
+        assert deleted
+        wait_until_deleted(cluster_name)
+    except Exception:
+        pass
 
 @service_marker
 @pytest.mark.canary
@@ -141,6 +147,164 @@ def test_create_auto_mode_cluster(self, eks_client, auto_mode_cluster):
         time.sleep(CHECK_STATUS_WAIT_SECONDS)
 
         # Clean up
-        _, deleted = k8s.delete_custom_resource(ref, 3, 10)
+        _, deleted = k8s.delete_custom_resource(ref, 9, 10)
         assert deleted
         wait_until_deleted(cluster_name)
+
+
+@service_marker
+@pytest.mark.canary
+class TestAutoModeClusterUpdates:
+    def test_enable_auto_mode_on_standard_cluster(self, eks_client, simple_cluster):
+        (ref, cr) = simple_cluster
+        cluster_name = cr["spec"]["name"]
+
+        aws_res = eks_client.describe_cluster(name=cluster_name)
+        assert aws_res is not None
+
+        # Wait for the cluster to be ACTIVE and let the controller refresh status
+        wait_for_cluster_active(eks_client, cluster_name)
+        time.sleep(CHECK_STATUS_WAIT_SECONDS)
+        get_and_assert_status(ref, "ACTIVE", True)
+
+        # Patch to enable Auto Mode
+        patch_enable_auto_mode = {
+            "spec": {
+                "computeConfig": {"enabled": True},
+                "storageConfig": {"blockStorage": {"enabled": True}},
+                "kubernetesNetworkConfig": {
+                    "elasticLoadBalancing": {"enabled": True},
+                    "ipFamily": "ipv4",
+                },
+            }
+        }
+        k8s.patch_custom_resource(ref, patch_enable_auto_mode)
+        time.sleep(MODIFY_WAIT_AFTER_SECONDS)
+        get_and_assert_status(ref, "UPDATING", False)
+
+        # Wait for the cluster to become active after the update
+        wait_for_cluster_active(eks_client, cluster_name)
+        time.sleep(CHECK_STATUS_WAIT_SECONDS)
+        get_and_assert_status(ref, "ACTIVE", True)
+
+        # Verify Auto Mode activation via the EKS update history, since
+        # DescribeCluster may not reflect the new fields immediately.
+        updates_summary = eks_client.list_updates(name=cluster_name)
+
+        update_ids = updates_summary.get("updateIds", [])
+        assert len(update_ids) == 1, (
+            f"Expected exactly 1 update, got {len(update_ids)}: {update_ids}"
+        )
+
+        update_id = update_ids[0]
+        upd_desc = eks_client.describe_update(name=cluster_name, updateId=update_id)
+
+        update_info = upd_desc["update"]
+
+        # Verify update type and status
+        assert update_info["type"] == "AutoModeUpdate", (
+            f"Expected AutoModeUpdate, got: {update_info['type']}"
+        )
+        assert update_info["status"] == "Successful", (
+            f"Expected Successful status, got: {update_info['status']}"
+        )
+
+    def test_disable_auto_mode_incorrectly(self, eks_client, auto_mode_cluster):
+        (ref, cr) = auto_mode_cluster
+        cluster_name = cr["spec"]["name"]
+
+        try:
+            aws_res = eks_client.describe_cluster(name=cluster_name)
+            assert aws_res is not None
+        except eks_client.exceptions.ResourceNotFoundException:
+            pytest.fail(f"Could not find cluster '{cluster_name}' in EKS")
+
+        wait_for_cluster_active(eks_client, cluster_name)
+        time.sleep(CHECK_STATUS_WAIT_SECONDS)
+        get_and_assert_status(ref, "ACTIVE", True)
+
+        # Patch with inconsistent parameters to disable Auto Mode
+        patch_disable_auto_mode_incorrectly = {
+            "spec": {
+                "computeConfig": {"enabled": False},
+                "storageConfig": {
+                    "blockStorage": {
+                        "enabled": True  # Should be False
+                    }
+                },
+                "kubernetesNetworkConfig": {"elasticLoadBalancing": {"enabled": False}},
+            }
+        }
+
+        k8s.patch_custom_resource(ref, patch_disable_auto_mode_incorrectly)
+        time.sleep(MODIFY_WAIT_AFTER_SECONDS)
+
+        # The controller should detect the invalid configuration and set a terminal condition.
+        terminal_condition = "ACK.Terminal"
+        cond = k8s.get_resource_condition(ref, terminal_condition)
+        if cond is None:
+            pytest.fail(
+                f"Failed to find {terminal_condition} condition in resource {ref}"
+            )
+
+        cond_status = cond.get("status", None)
+        if str(cond_status) != str(True):
+            pytest.fail(
+                f"Expected {terminal_condition} condition to have status True but found {cond_status}"
+            )
+
+        # Verify the error message reports the invalid Auto Mode configuration
+        assert "invalid Auto Mode configuration" in cond.get("message", "")
+
+    def test_disable_auto_mode_correctly(self, eks_client, auto_mode_cluster):
+        (ref, cr) = auto_mode_cluster
+        cluster_name = cr["spec"]["name"]
+
+        try:
+            aws_res = eks_client.describe_cluster(name=cluster_name)
+            assert aws_res is not None
+        except eks_client.exceptions.ResourceNotFoundException:
+            pytest.fail(f"Could not find cluster '{cluster_name}' in EKS")
+
+        wait_for_cluster_active(eks_client, cluster_name)
+        time.sleep(CHECK_STATUS_WAIT_SECONDS)
+        get_and_assert_status(ref, "ACTIVE", True)
+
+        # Patch to disable Auto Mode correctly: all three capabilities off together
+        patch_disable_auto_mode = {
+            "spec": {
+                "computeConfig": {"enabled": False},
+                "storageConfig": {"blockStorage": {"enabled": False}},
+                "kubernetesNetworkConfig": {"elasticLoadBalancing": {"enabled": False}},
+            }
+        }
+
+        k8s.patch_custom_resource(ref, patch_disable_auto_mode)
+        time.sleep(MODIFY_WAIT_AFTER_SECONDS)
+        get_and_assert_status(ref, "UPDATING", False)
+
+        wait_for_cluster_active(eks_client, cluster_name)
+        time.sleep(CHECK_STATUS_WAIT_SECONDS)
+        get_and_assert_status(ref, "ACTIVE", True)
+
+        # Verify Auto Mode is disabled
+        aws_res = eks_client.describe_cluster(name=cluster_name)
+        compute_config = aws_res["cluster"].get("computeConfig")
+        if compute_config is not None:
+            assert compute_config.get("enabled") is False, (
+                f"computeConfig.enabled should be False or absent, got: {compute_config.get('enabled')}"
+            )
+
+        storage_config = aws_res["cluster"].get("storageConfig")
+        if storage_config is not None:
+            block_storage = storage_config.get("blockStorage", {})
+            if block_storage:
+                assert block_storage.get("enabled") is False, (
+                    f"storageConfig.blockStorage.enabled should be False or absent, got: {block_storage.get('enabled')}"
+                )
+
+        k8s_network_config = aws_res["cluster"].get("kubernetesNetworkConfig", {})
+        elb_config = k8s_network_config.get("elasticLoadBalancing")
+        if elb_config is not None:
+            assert elb_config.get("enabled") is False, (
+                f"kubernetesNetworkConfig.elasticLoadBalancing.enabled should be False or absent, got: {elb_config.get('enabled')}"
+            )
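Reviewer note: for context on why the hook validates the three blocks as a unit, EKS applies Auto Mode changes through a single UpdateClusterConfig call carrying all three capability structs, and per the AWS documentation cited in the hook comment the capabilities must be toggled together. A minimal sketch of that call, assuming aws-sdk-go-v2's EKS types (`ComputeConfigRequest`, `StorageConfigRequest`, `BlockStorage`, `KubernetesNetworkConfigRequest`, `ElasticLoadBalancing`); the cluster name is a placeholder, and none of this is code from the PR:

```go
package main

import (
	"context"
	"log"

	"github.com/aws/aws-sdk-go-v2/aws"
	"github.com/aws/aws-sdk-go-v2/config"
	"github.com/aws/aws-sdk-go-v2/service/eks"
	ekstypes "github.com/aws/aws-sdk-go-v2/service/eks/types"
)

func main() {
	cfg, err := config.LoadDefaultConfig(context.TODO())
	if err != nil {
		log.Fatal(err)
	}
	client := eks.NewFromConfig(cfg)

	enabled := true
	// All three Auto Mode capabilities travel in one UpdateClusterConfig
	// request; a partial or mixed combination is what the controller-side
	// validation in hook.go surfaces early as a terminal condition.
	out, err := client.UpdateClusterConfig(context.TODO(), &eks.UpdateClusterConfigInput{
		Name:          aws.String("my-cluster"), // placeholder
		ComputeConfig: &ekstypes.ComputeConfigRequest{Enabled: &enabled},
		StorageConfig: &ekstypes.StorageConfigRequest{
			BlockStorage: &ekstypes.BlockStorage{Enabled: &enabled},
		},
		KubernetesNetworkConfig: &ekstypes.KubernetesNetworkConfigRequest{
			ElasticLoadBalancing: &ekstypes.ElasticLoadBalancing{Enabled: &enabled},
		},
	})
	if err != nil {
		log.Fatal(err)
	}
	// The call is asynchronous: poll DescribeUpdate with the returned update
	// ID, as the new e2e test does, to confirm the AutoModeUpdate succeeded.
	log.Printf("started update %s", aws.ToString(out.Update.Id))
}
```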