-
Notifications
You must be signed in to change notification settings - Fork 44
Fix nil pointer panic and spurious Auto Mode updates #171
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -184,6 +184,45 @@ func (rm *resourceManager) clusterInUse(ctx context.Context, r *resource) (bool, | |
return (nodes != nil && len(nodes.Nodegroups) > 0), nil | ||
} | ||
|
||
// isAutoModeCluster returns true if the cluster is configured for EKS Auto Mode. | ||
// According to AWS documentation, compute, block storage, and load balancing capabilities | ||
// must all be enabled or disabled together. Any partial configuration is invalid. | ||
// Returns an error for invalid configurations. | ||
func isAutoModeCluster(r *resource) (bool, error) { | ||
if r == nil || r.ko == nil { | ||
return false, nil | ||
} | ||
|
||
hasComputeConfig := r.ko.Spec.ComputeConfig != nil | ||
hasStorageConfig := r.ko.Spec.StorageConfig != nil && r.ko.Spec.StorageConfig.BlockStorage != nil | ||
hasELBConfig := r.ko.Spec.KubernetesNetworkConfig != nil && r.ko.Spec.KubernetesNetworkConfig.ElasticLoadBalancing != nil | ||
|
||
// If no Auto Mode configuration is present, it's valid (not an Auto Mode cluster) | ||
if !hasComputeConfig && !hasStorageConfig && !hasELBConfig { | ||
return false, nil | ||
} | ||
|
||
// If any Auto Mode configuration is present, ALL must be present | ||
if !hasComputeConfig || !hasStorageConfig || !hasELBConfig { | ||
return false, fmt.Errorf("invalid Auto Mode configuration: when configuring Auto Mode, all three capabilities must be specified (compute=%v, storage=%v, elb=%v)", | ||
hasComputeConfig, hasStorageConfig, hasELBConfig) | ||
} | ||
|
||
computeEnabled := r.ko.Spec.ComputeConfig.Enabled != nil && *r.ko.Spec.ComputeConfig.Enabled | ||
storageEnabled := r.ko.Spec.StorageConfig.BlockStorage.Enabled != nil && *r.ko.Spec.StorageConfig.BlockStorage.Enabled | ||
elbEnabled := r.ko.Spec.KubernetesNetworkConfig.ElasticLoadBalancing.Enabled != nil && *r.ko.Spec.KubernetesNetworkConfig.ElasticLoadBalancing.Enabled | ||
|
||
// All three must be in the same state | ||
if computeEnabled != storageEnabled || storageEnabled != elbEnabled { | ||
return false, fmt.Errorf("invalid Auto Mode configuration: compute, block storage, and load balancing capabilities must all be enabled or disabled together (compute=%v, storage=%v, elb=%v)", | ||
computeEnabled, storageEnabled, elbEnabled) | ||
} | ||
|
||
isAutoMode := (computeEnabled && storageEnabled && elbEnabled) || (!computeEnabled && !storageEnabled && !elbEnabled) | ||
return isAutoMode, nil | ||
} | ||
|
||
|
||
func customPreCompare( | ||
a *resource, | ||
b *resource, | ||
|
@@ -380,25 +419,38 @@ func (rm *resourceManager) customUpdate( | |
return returnClusterUpdating(updatedRes) | ||
} | ||
|
||
// Handle computeConfig updates | ||
// Handle computeConfig updates - only for Auto Mode clusters | ||
if delta.DifferentAt("Spec.ComputeConfig") || delta.DifferentAt("Spec.StorageConfig") || delta.DifferentAt("Spec.KubernetesNetworkConfig") { | ||
if err := rm.updateComputeConfig(ctx, desired); err != nil { | ||
awsErr, ok := extractAWSError(err) | ||
rlog.Info("attempting to update AutoMode config", | ||
"error", err, | ||
"isAWSError", ok, | ||
"awsErrorCode", awsErr.Code) | ||
|
||
// Check to see if we've raced an async update call and need to requeue | ||
if ok && awsErr.Code == "ResourceInUseException" { | ||
rlog.Info("resource in use, requeueing after async update") | ||
return nil, requeueAfterAsyncUpdate() | ||
// Validate Auto Mode configuration and proceed only if cluster is configured for Auto Mode | ||
isAutoMode, err := isAutoModeCluster(desired) | ||
if err != nil { | ||
return nil, ackerr.NewTerminalError(err) | ||
} | ||
if isAutoMode { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Q: If this is false due to a user removing the auto-mode flags do we need to take any action? As-is we won't send any API request and leave the EKS cluster with whatever values were already present. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yup, if it's the an invalid automode payload it's not sent. No action is taken apart from logging it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It seems like this could lead to some odd behavior from a user's perspective. Here's a sequence of events that could happen with this logic.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we still make the API request? |
||
if err := rm.updateComputeConfig(ctx, desired); err != nil { | ||
awsErr, ok := extractAWSError(err) | ||
var awsErrorCode string | ||
if ok && awsErr != nil { | ||
awsErrorCode = awsErr.Code | ||
} | ||
rlog.Info("attempting to update AutoMode config", | ||
"error", err, | ||
"isAWSError", ok, | ||
"awsErrorCode", awsErrorCode) | ||
|
||
// Check to see if we've raced an async update call and need to requeue | ||
if ok && awsErr != nil && awsErr.Code == "ResourceInUseException" { | ||
rlog.Info("resource in use, requeueing after async update") | ||
return nil, requeueAfterAsyncUpdate() | ||
} | ||
|
||
return nil, fmt.Errorf("failed to update AutoMode config: %w", err) | ||
} | ||
|
||
return nil, fmt.Errorf("failed to update AutoMode config: %w", err) | ||
return returnClusterUpdating(updatedRes) | ||
} | ||
|
||
return returnClusterUpdating(updatedRes) | ||
// If not Auto Mode, ignore the diff | ||
rlog.Info("ignoring diff on compute/storage/network config for non-Auto Mode cluster") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Q: Will this not result in the delta still being present in the next reconcile loop? Might be tough to avoid this if the API is returning invalid auto-mode flag combinations unless we treat There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, we will see the delta in logs but not sent the payload for update. Suggestion on what should be done instead? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we're treating nil as false in the validation logic, we could do the same in the delta comparison. That way a partially false set of flags won't register as a diff. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We would need to validate that nil is equivalent to false in the EKS service as well though. |
||
} | ||
|
||
// Handle zonalShiftConfig updates | ||
|
Uh oh!
There was an error while loading. Please reload this page.