82 changes: 67 additions & 15 deletions pkg/resource/cluster/hook.go
@@ -184,6 +184,45 @@ func (rm *resourceManager) clusterInUse(ctx context.Context, r *resource) (bool,
return (nodes != nil && len(nodes.Nodegroups) > 0), nil
}

// isAutoModeCluster returns true if the resource carries a complete EKS Auto
// Mode configuration (compute, block storage, and load balancing all present).
// According to AWS documentation, these three capabilities must be enabled or
// disabled together; any partial configuration is invalid and yields an error.
func isAutoModeCluster(r *resource) (bool, error) {
if r == nil || r.ko == nil {
return false, nil
}

hasComputeConfig := r.ko.Spec.ComputeConfig != nil
hasStorageConfig := r.ko.Spec.StorageConfig != nil && r.ko.Spec.StorageConfig.BlockStorage != nil
hasELBConfig := r.ko.Spec.KubernetesNetworkConfig != nil && r.ko.Spec.KubernetesNetworkConfig.ElasticLoadBalancing != nil

// If no Auto Mode configuration is present, it's valid (not an Auto Mode cluster)
if !hasComputeConfig && !hasStorageConfig && !hasELBConfig {
return false, nil
}

// If any Auto Mode configuration is present, ALL must be present
if !hasComputeConfig || !hasStorageConfig || !hasELBConfig {
return false, fmt.Errorf("invalid Auto Mode configuration: when configuring Auto Mode, all three capabilities must be specified (compute=%v, storage=%v, elb=%v)",
hasComputeConfig, hasStorageConfig, hasELBConfig)
}

computeEnabled := r.ko.Spec.ComputeConfig.Enabled != nil && *r.ko.Spec.ComputeConfig.Enabled
storageEnabled := r.ko.Spec.StorageConfig.BlockStorage.Enabled != nil && *r.ko.Spec.StorageConfig.BlockStorage.Enabled
elbEnabled := r.ko.Spec.KubernetesNetworkConfig.ElasticLoadBalancing.Enabled != nil && *r.ko.Spec.KubernetesNetworkConfig.ElasticLoadBalancing.Enabled

// All three must be in the same state
if computeEnabled != storageEnabled || storageEnabled != elbEnabled {
return false, fmt.Errorf("invalid Auto Mode configuration: compute, block storage, and load balancing capabilities must all be enabled or disabled together (compute=%v, storage=%v, elb=%v)",
computeEnabled, storageEnabled, elbEnabled)
}

	// All three flags are in the same state at this point, so the resource
	// carries a complete Auto Mode configuration (fully enabled or fully disabled).
	return true, nil
}


func customPreCompare(
a *resource,
b *resource,
@@ -380,25 +419,38 @@ func (rm *resourceManager) customUpdate(
return returnClusterUpdating(updatedRes)
}

-	// Handle computeConfig updates
+	// Handle computeConfig updates - only for Auto Mode clusters
	if delta.DifferentAt("Spec.ComputeConfig") || delta.DifferentAt("Spec.StorageConfig") || delta.DifferentAt("Spec.KubernetesNetworkConfig") {
-		if err := rm.updateComputeConfig(ctx, desired); err != nil {
-			awsErr, ok := extractAWSError(err)
-			rlog.Info("attempting to update AutoMode config",
-				"error", err,
-				"isAWSError", ok,
-				"awsErrorCode", awsErr.Code)
-
-			// Check to see if we've raced an async update call and need to requeue
-			if ok && awsErr.Code == "ResourceInUseException" {
-				rlog.Info("resource in use, requeueing after async update")
-				return nil, requeueAfterAsyncUpdate()
-			}
-
-			return nil, fmt.Errorf("failed to update AutoMode config: %w", err)
-		}
-
-		return returnClusterUpdating(updatedRes)
+		// Validate the Auto Mode configuration and proceed only if the cluster is configured for Auto Mode
+		isAutoMode, err := isAutoModeCluster(desired)
+		if err != nil {
+			return nil, ackerr.NewTerminalError(err)
+		}
+		if isAutoMode {
Contributor: Q: If this is false due to a user removing the auto-mode flags, do we need to take any action? As-is we won't send any API request and will leave the EKS cluster with whatever values were already present.

Member (@rushmash91, Oct 9, 2025): Yup, if it's an invalid automode payload it's not sent. No action is taken apart from logging it.

Contributor: It seems like this could lead to some odd behavior from a user's perspective. Here's a sequence of events that could happen with this logic:

  1. User creates a cluster without any auto-mode configs set. The cluster created in EKS has auto-mode disabled, as expected.

  2. User adds auto-mode configs with all values true. The cluster is modified to use auto-mode, as expected.

  3. User decides they don't want auto-mode and rolls back the ACK resource to the original spec. We log that we are ignoring the diff for a non-auto-mode cluster. However, the actual cluster in EKS still has auto-mode enabled.

Member (@michaelhtm, Oct 9, 2025): Should we still make the API request? That would keep this change safe even if the API behavior changes. We can mark the encountered error terminal for now.
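As an illustration of that suggestion, here is a minimal, self-contained sketch in which the update is always attempted and a service-side validation failure is marked terminal. Every name below (updateComputeConfig, errInvalidParameter, the error text) is a stand-in, not the controller's actual code:

package main

import (
	"errors"
	"fmt"
)

// Stand-in for the error EKS would return for an incomplete payload.
var errInvalidParameter = errors.New("InvalidParameterException: incomplete Auto Mode configuration")

// updateComputeConfig simulates the EKS call: the service itself rejects an
// invalid payload.
func updateComputeConfig(validSpec bool) error {
	if !validSpec {
		return errInvalidParameter
	}
	return nil
}

// reconcile always attempts the update and surfaces a service-side
// validation failure as terminal instead of skipping the API call.
func reconcile(validSpec bool) error {
	if err := updateComputeConfig(validSpec); err != nil {
		if errors.Is(err, errInvalidParameter) {
			return fmt.Errorf("terminal: %w", err) // stand-in for ackerr.NewTerminalError
		}
		return err
	}
	return nil
}

func main() {
	fmt.Println(reconcile(true))  // <nil>
	fmt.Println(reconcile(false)) // terminal: InvalidParameterException: ...
}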

+			if err := rm.updateComputeConfig(ctx, desired); err != nil {
+				awsErr, ok := extractAWSError(err)
+				var awsErrorCode string
+				if ok && awsErr != nil {
+					awsErrorCode = awsErr.Code
+				}
+				rlog.Info("attempting to update AutoMode config",
+					"error", err,
+					"isAWSError", ok,
+					"awsErrorCode", awsErrorCode)
+
+				// Check to see if we've raced an async update call and need to requeue
+				if ok && awsErr != nil && awsErr.Code == "ResourceInUseException" {
+					rlog.Info("resource in use, requeueing after async update")
+					return nil, requeueAfterAsyncUpdate()
+				}
+
+				return nil, fmt.Errorf("failed to update AutoMode config: %w", err)
+			}
+
+			return returnClusterUpdating(updatedRes)
+		}
+
+		// If not Auto Mode, ignore the diff
+		rlog.Info("ignoring diff on compute/storage/network config for non-Auto Mode cluster")
Contributor: Q: Will this not result in the delta still being present in the next reconcile loop? It might be tough to avoid this if the API returns invalid auto-mode flag combinations, unless we treat nil as equal to false in the delta comparison.

Member: Yes, we will see the delta in the logs, but the update payload won't be sent. Any suggestions on what should be done instead?

Contributor: If we're treating nil as false in the validation logic, we could do the same in the delta comparison. That way a partially false set of flags won't register as a diff.

Contributor: We would need to validate that nil is equivalent to false in the EKS service as well, though.
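A minimal sketch of the nil-as-false comparison suggested above; equalBoolPtr is a hypothetical helper name, not part of this PR:

package main

import "fmt"

// equalBoolPtr treats a nil *bool as false, so an unset Auto Mode flag
// compares equal to an explicit false and would not register as a diff.
func equalBoolPtr(a, b *bool) bool {
	av := a != nil && *a
	bv := b != nil && *b
	return av == bv
}

func main() {
	f, t := false, true
	fmt.Println(equalBoolPtr(nil, &f)) // true: nil compares equal to false
	fmt.Println(equalBoolPtr(nil, &t)) // false: nil still differs from true
	fmt.Println(equalBoolPtr(&t, &t))  // true
}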

}

// Handle zonalShiftConfig updates
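To make the all-or-nothing rule concrete, here is a standalone sketch mirroring the decision table of isAutoModeCluster above; classify is illustrative only, not controller code:

package main

import "fmt"

// classify mirrors the presence check: the three Auto Mode sections must be
// configured all together or not at all.
func classify(hasCompute, hasStorage, hasELB bool) (bool, error) {
	switch {
	case !hasCompute && !hasStorage && !hasELB:
		return false, nil // no Auto Mode config: valid, not Auto Mode managed
	case hasCompute && hasStorage && hasELB:
		return true, nil // complete config: Auto Mode managed (enabled or disabled)
	default:
		return false, fmt.Errorf("invalid Auto Mode configuration: partial config (compute=%v, storage=%v, elb=%v)", hasCompute, hasStorage, hasELB)
	}
}

func main() {
	fmt.Println(classify(true, true, true))    // true <nil>
	fmt.Println(classify(true, false, true))   // false, error
	fmt.Println(classify(false, false, false)) // false <nil>
}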
175 changes: 170 additions & 5 deletions test/e2e/tests/test_cluster_automode.py
@@ -18,6 +18,7 @@
import logging
import time
import pytest
import json

from acktest.k8s import resource as k8s
from acktest.k8s import condition
@@ -33,9 +34,10 @@
from e2e.common.types import CLUSTER_RESOURCE_PLURAL
from e2e.common.waiter import wait_until_deleted
from e2e.replacement_values import REPLACEMENT_VALUES
from e2e.tests.test_cluster import simple_cluster

-MODIFY_WAIT_AFTER_SECONDS = 240
-CHECK_STATUS_WAIT_SECONDS = 240
+MODIFY_WAIT_AFTER_SECONDS = 60
+CHECK_STATUS_WAIT_SECONDS = 30


def wait_for_cluster_active(eks_client, cluster_name):
@@ -93,8 +95,13 @@ def auto_mode_cluster(eks_client):

yield (ref, cr)

-    pass
+    # Best-effort cleanup in case the cluster still exists
+    try:
+        _, deleted = k8s.delete_custom_resource(ref, 9, 10)
+        assert deleted
+        wait_until_deleted(cluster_name)
+    except Exception:
+        pass

@service_marker
@pytest.mark.canary
@@ -141,6 +148,164 @@ def test_create_auto_mode_cluster(self, eks_client, auto_mode_cluster):
time.sleep(CHECK_STATUS_WAIT_SECONDS)

# Clean up
-        _, deleted = k8s.delete_custom_resource(ref, 3, 10)
+        _, deleted = k8s.delete_custom_resource(ref, 9, 10)
assert deleted
wait_until_deleted(cluster_name)


@service_marker
@pytest.mark.canary
class TestAutoModeClusterUpdates:
def test_enable_auto_mode_on_standard_cluster(self, eks_client, simple_cluster):
(ref, cr) = simple_cluster
cluster_name = cr["spec"]["name"]

aws_res = eks_client.describe_cluster(name=cluster_name)
assert aws_res is not None

# Wait for the cluster to be ACTIVE and let controller refresh status
wait_for_cluster_active(eks_client, cluster_name)
time.sleep(CHECK_STATUS_WAIT_SECONDS)
get_and_assert_status(ref, "ACTIVE", True)

# Patch to enable auto-mode
patch_enable_auto_mode = {
"spec": {
"computeConfig": {"enabled": True},
"storageConfig": {"blockStorage": {"enabled": True}},
"kubernetesNetworkConfig": {
"elasticLoadBalancing": {"enabled": True},
"ipFamily": "ipv4",
},
}
}
k8s.patch_custom_resource(ref, patch_enable_auto_mode)
time.sleep(MODIFY_WAIT_AFTER_SECONDS)
get_and_assert_status(ref, "UPDATING", False)

# Wait for cluster to become active after update
wait_for_cluster_active(eks_client, cluster_name)
time.sleep(CHECK_STATUS_WAIT_SECONDS)
get_and_assert_status(ref, "ACTIVE", True)

# Verify auto-mode activation via EKS update history (since DescribeCluster may not reflect the fields immediately)
updates_summary = eks_client.list_updates(name=cluster_name)

update_ids = updates_summary.get("updateIds", [])
assert len(update_ids) == 1, (
f"Expected exactly 1 update, got {len(update_ids)}: {update_ids}"
)

update_id = update_ids[0]
upd_desc = eks_client.describe_update(name=cluster_name, updateId=update_id)

update_info = upd_desc["update"]

# Verify update type and status
assert update_info["type"] == "AutoModeUpdate", (
f"Expected AutoModeUpdate, got: {update_info['type']}"
)
assert update_info["status"] == "Successful", (
f"Expected Successful status, got: {update_info['status']}"
)

def test_disable_auto_mode_incorrectly(self, eks_client, auto_mode_cluster):
(ref, cr) = auto_mode_cluster
cluster_name = cr["spec"]["name"]

try:
aws_res = eks_client.describe_cluster(name=cluster_name)
assert aws_res is not None
except eks_client.exceptions.ResourceNotFoundException:
pytest.fail(f"Could not find cluster '{cluster_name}' in EKS")

wait_for_cluster_active(eks_client, cluster_name)
time.sleep(CHECK_STATUS_WAIT_SECONDS)
get_and_assert_status(ref, "ACTIVE", True)

# Patch with incorrect parameters to disable auto-mode
patch_disable_auto_mode_incorrectly = {
"spec": {
"computeConfig": {"enabled": False},
"storageConfig": {
"blockStorage": {
"enabled": True # Should be False
}
},
"kubernetesNetworkConfig": {"elasticLoadBalancing": {"enabled": False}},
}
}

k8s.patch_custom_resource(ref, patch_disable_auto_mode_incorrectly)
time.sleep(MODIFY_WAIT_AFTER_SECONDS)

# The controller should detect the invalid configuration and set a terminal condition.
terminal_condition = "ACK.Terminal"
cond = k8s.get_resource_condition(ref, terminal_condition)
if cond is None:
pytest.fail(
f"Failed to find {terminal_condition} condition in resource {ref}"
)

cond_status = cond.get("status", None)
if str(cond_status) != str(True):
pytest.fail(
f"Expected {terminal_condition} condition to have status True but found {cond_status}"
)

# Verify the error message contains information about invalid Auto Mode configuration
assert "invalid Auto Mode configuration" in cond.get("message", "")

def test_disable_auto_mode_correctly(self, eks_client, auto_mode_cluster):
(ref, cr) = auto_mode_cluster
cluster_name = cr["spec"]["name"]

try:
aws_res = eks_client.describe_cluster(name=cluster_name)
assert aws_res is not None
except eks_client.exceptions.ResourceNotFoundException:
pytest.fail(f"Could not find cluster '{cluster_name}' in EKS")

wait_for_cluster_active(eks_client, cluster_name)
time.sleep(CHECK_STATUS_WAIT_SECONDS)
get_and_assert_status(ref, "ACTIVE", True)

# Patch to disable auto-mode correctly
patch_disable_auto_mode = {
"spec": {
"computeConfig": {"enabled": False},
"storageConfig": {"blockStorage": {"enabled": False}},
"kubernetesNetworkConfig": {"elasticLoadBalancing": {"enabled": False}},
}
}

k8s.patch_custom_resource(ref, patch_disable_auto_mode)
        time.sleep(MODIFY_WAIT_AFTER_SECONDS)
get_and_assert_status(ref, "UPDATING", False)

wait_for_cluster_active(eks_client, cluster_name)
time.sleep(CHECK_STATUS_WAIT_SECONDS)
get_and_assert_status(ref, "ACTIVE", True)

# Verify auto-mode is disabled
aws_res = eks_client.describe_cluster(name=cluster_name)
compute_config = aws_res["cluster"].get("computeConfig")
if compute_config is not None:
assert compute_config.get("enabled") is False, (
f"computeConfig.enabled should be False or absent, got: {compute_config.get('enabled')}"
)

storage_config = aws_res["cluster"].get("storageConfig")
if storage_config is not None:
block_storage = storage_config.get("blockStorage", {})
if block_storage:
assert block_storage.get("enabled") is False, (
f"storageConfig.blockStorage.enabled should be False or absent, got: {block_storage.get('enabled')}"
)

k8s_network_config = aws_res["cluster"].get("kubernetesNetworkConfig", {})
elb_config = k8s_network_config.get("elasticLoadBalancing")
if elb_config is not None:
assert elb_config.get("enabled") is False, (
f"kubernetesNetworkConfig.elasticLoadBalancing.enabled should be False or absent, got: {elb_config.get('enabled')}"
)