Skip to content

Commit f796072

Browse files
committed
chore: add DHCP renew release test
Signed-off-by: Pau Capdevila <[email protected]>
1 parent 3ea41f6 commit f796072

File tree

1 file changed

+298
-0
lines changed

1 file changed

+298
-0
lines changed

pkg/hhfab/release.go

Lines changed: 298 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"sort"
2222
"strconv"
2323
"strings"
24+
"sync"
2425
"time"
2526

2627
agentapi "go.githedgehog.com/fabric/api/agent/v1beta1"
@@ -3105,6 +3106,299 @@ func (testCtx *VPCPeeringTestCtx) prometheusObservabilityTest(ctx context.Contex
31053106
return false, nil, nil
31063107
}
31073108

3109+
// Test DHCP renewal on VPC-attached interfaces
3110+
// Uses 1 server by default, all servers in extended mode
3111+
// Triggers DHCP renewal via networkctl and verifies connectivity
3112+
// Fails if any renewal takes longer than 15 seconds
3113+
func (testCtx *VPCPeeringTestCtx) dhcpRenewalTest(ctx context.Context) (bool, []RevertFunc, error) {
3114+
vpcAttaches := &vpcapi.VPCAttachmentList{}
3115+
if err := testCtx.kube.List(ctx, vpcAttaches); err != nil {
3116+
return false, nil, fmt.Errorf("listing VPCAttachments: %w", err)
3117+
}
3118+
3119+
servers := make([]ServerWithInterface, 0, len(vpcAttaches.Items))
3120+
for _, attach := range vpcAttaches.Items {
3121+
conn := &wiringapi.Connection{}
3122+
if err := testCtx.kube.Get(ctx, kclient.ObjectKey{
3123+
Namespace: kmetav1.NamespaceDefault,
3124+
Name: attach.Spec.Connection,
3125+
}, conn); err != nil {
3126+
continue
3127+
}
3128+
3129+
_, serverNames, _, _, err := conn.Spec.Endpoints()
3130+
if err != nil || len(serverNames) != 1 {
3131+
continue
3132+
}
3133+
3134+
vpc := &vpcapi.VPC{}
3135+
if err := testCtx.kube.Get(ctx, kclient.ObjectKey{
3136+
Namespace: kmetav1.NamespaceDefault,
3137+
Name: attach.Spec.VPCName(),
3138+
}, vpc); err != nil {
3139+
continue
3140+
}
3141+
3142+
subnet := vpc.Spec.Subnets[attach.Spec.SubnetName()]
3143+
if subnet == nil || !subnet.DHCP.Enable {
3144+
continue
3145+
}
3146+
3147+
_, err = GetServerNetconfCmd(conn, subnet.VLAN, testCtx.setupOpts.HashPolicy)
3148+
if err != nil {
3149+
continue
3150+
}
3151+
3152+
var ifName string
3153+
if conn.Spec.Unbundled != nil {
3154+
ifName = fmt.Sprintf("%s.%d", conn.Spec.Unbundled.Link.Server.LocalPortName(), subnet.VLAN)
3155+
} else {
3156+
ifName = fmt.Sprintf("bond0.%d", subnet.VLAN)
3157+
}
3158+
3159+
servers = append(servers, ServerWithInterface{
3160+
Name: serverNames[0],
3161+
Interface: ifName,
3162+
})
3163+
}
3164+
3165+
if len(servers) == 0 {
3166+
slog.Info("No servers with DHCP interfaces found, skipping DHCP renewal test")
3167+
3168+
return true, nil, fmt.Errorf("no servers with DHCP interfaces found") //nolint:goerr113
3169+
}
3170+
3171+
testServerCount := 1
3172+
if testCtx.extended {
3173+
testServerCount = len(servers)
3174+
}
3175+
3176+
servers = servers[:testServerCount]
3177+
3178+
// Determine timeout based on VPC mode
3179+
renewalTimeout := DefaultDHCPRenewalTimeoutL2
3180+
if testCtx.setupOpts.VPCMode == vpcapi.VPCModeL3VNI || testCtx.setupOpts.VPCMode == vpcapi.VPCModeL3Flat {
3181+
renewalTimeout = DefaultDHCPRenewalTimeoutL3
3182+
}
3183+
3184+
slog.Info("Testing DHCP renewal", "servers", len(servers), "mode", testCtx.setupOpts.VPCMode, "timeout", renewalTimeout)
3185+
3186+
var wg sync.WaitGroup
3187+
results := make(chan RenewalResult, len(servers))
3188+
errors := sync.Map{}
3189+
3190+
for _, server := range servers {
3191+
wg.Add(1)
3192+
go func(srv ServerWithInterface) {
3193+
defer wg.Done()
3194+
3195+
start := time.Now()
3196+
err := testCtx.performDHCPRenewal(ctx, srv.Name, srv.Interface)
3197+
duration := time.Since(start)
3198+
3199+
result := RenewalResult{
3200+
Server: srv.Name,
3201+
Duration: duration,
3202+
Error: err,
3203+
}
3204+
3205+
if err != nil {
3206+
errors.Store(fmt.Sprintf("dhcp-renewal--%s", srv.Name), err)
3207+
}
3208+
3209+
results <- result
3210+
}(server)
3211+
}
3212+
3213+
wg.Wait()
3214+
close(results)
3215+
3216+
var failures []string
3217+
var slowRenewals []string
3218+
successCount := 0
3219+
maxDuration := time.Duration(0)
3220+
3221+
for result := range results {
3222+
if result.Error != nil {
3223+
failures = append(failures, fmt.Sprintf("%s: %v", result.Server, result.Error))
3224+
} else {
3225+
successCount++
3226+
if result.Duration > maxDuration {
3227+
maxDuration = result.Duration
3228+
}
3229+
if result.Duration > renewalTimeout {
3230+
slowRenewals = append(slowRenewals,
3231+
fmt.Sprintf("%s: %v", result.Server, result.Duration))
3232+
}
3233+
}
3234+
}
3235+
3236+
var additionalErrors []string
3237+
errors.Range(func(key, value any) bool {
3238+
additionalErrors = append(additionalErrors, fmt.Sprintf("%s: %v", key, value))
3239+
3240+
return true
3241+
})
3242+
3243+
if len(failures) > 0 || len(additionalErrors) > 0 {
3244+
failures = append(failures, additionalErrors...)
3245+
3246+
return false, nil, fmt.Errorf("DHCP renewal failures: %v", failures) //nolint:goerr113
3247+
}
3248+
3249+
if len(slowRenewals) > 0 {
3250+
return false, nil, fmt.Errorf("slow DHCP renewals detected (>%v): %v", renewalTimeout, slowRenewals) //nolint:goerr113
3251+
}
3252+
3253+
slog.Info("DHCP renewal test passed", "servers", len(servers), "maxDuration", maxDuration)
3254+
3255+
return false, nil, nil
3256+
}
3257+
3258+
// Timing limits used by the DHCP renewal test.
const (
	// DefaultDHCPRenewalTimeoutL2 is the longest a successful renewal may take
	// before dhcpRenewalTest reports it as slow, for VPC modes other than
	// L3VNI and L3Flat. L2VNI renewals should be fast.
	DefaultDHCPRenewalTimeoutL2 = 10 * time.Second
	// DefaultDHCPRenewalTimeoutL3 is the same limit for the L3VNI and L3Flat
	// VPC modes, where renewal is a 2-step process.
	DefaultDHCPRenewalTimeoutL3 = 20 * time.Second
	// DHCPRenewalMaxWait is the hard deadline after which performDHCPRenewal
	// stops polling for the renewed lease and returns a timeout error.
	DHCPRenewalMaxWait = 25 * time.Second
)
3263+
3264+
// ServerWithInterface identifies one server interface on which a DHCP renewal
// is triggered.
type ServerWithInterface struct {
	// Name is the server name, used to open the SSH session.
	Name string
	// Interface is the VLAN interface on that server, e.g. "bond0.<vlan>" or
	// "<port>.<vlan>".
	Interface string
}
3268+
3269+
// RenewalResult is the outcome of a single DHCP renewal attempt.
type RenewalResult struct {
	// Server is the name of the server the renewal ran on.
	Server string
	// Duration is the wall-clock time the renewal attempt took.
	Duration time.Duration
	// Error is nil when the renewal succeeded.
	Error error
}
3274+
3275+
func (testCtx *VPCPeeringTestCtx) performDHCPRenewal(ctx context.Context, serverName, ifName string) error {
3276+
isL3Mode := testCtx.setupOpts.VPCMode == vpcapi.VPCModeL3VNI || testCtx.setupOpts.VPCMode == vpcapi.VPCModeL3Flat
3277+
3278+
ssh, err := testCtx.getSSH(ctx, serverName)
3279+
if err != nil {
3280+
return fmt.Errorf("getting ssh config for server %s: %w", serverName, err)
3281+
}
3282+
3283+
// Log initial state
3284+
initialOut, _, _ := ssh.Run(ctx,
3285+
fmt.Sprintf("ip addr show dev %s proto 4 | grep valid_lft", ifName))
3286+
slog.Debug("Initial lease state", "server", serverName, "interface", ifName, "lease", strings.TrimSpace(initialOut))
3287+
3288+
// Parse initial lease time
3289+
var initialLease int
3290+
if initialOut != "" {
3291+
tokens := strings.Split(strings.TrimLeft(initialOut, " \t"), " ")
3292+
if len(tokens) >= 2 {
3293+
stripped, _ := strings.CutSuffix(tokens[1], "sec")
3294+
if lease, parseErr := strconv.Atoi(stripped); parseErr == nil {
3295+
initialLease = lease
3296+
}
3297+
}
3298+
}
3299+
3300+
_, _, err = ssh.Run(ctx, fmt.Sprintf("sudo networkctl reconfigure %s", ifName))
3301+
if err != nil {
3302+
return fmt.Errorf("networkctl reconfigure failed: %w", err)
3303+
}
3304+
3305+
renewalCtx, cancel := context.WithTimeout(ctx, DHCPRenewalMaxWait)
3306+
defer cancel()
3307+
3308+
pollCount := 0
3309+
shortLeaseFound := false
3310+
3311+
for {
3312+
pollCount++
3313+
out, _, err := ssh.Run(ctx, fmt.Sprintf("ip addr show dev %s proto 4", ifName))
3314+
3315+
// Parse the output to understand what's happening
3316+
var leaseInfo string
3317+
var hasInterface bool
3318+
var hasIP bool
3319+
3320+
if err == nil {
3321+
hasInterface = true
3322+
if strings.Contains(out, "valid_lft") {
3323+
// Extract just the lease line for logging
3324+
lines := strings.Split(out, "\n")
3325+
for _, line := range lines {
3326+
if strings.Contains(line, "valid_lft") {
3327+
leaseInfo = strings.TrimSpace(line)
3328+
hasIP = true
3329+
3330+
break
3331+
}
3332+
}
3333+
}
3334+
}
3335+
3336+
// Improved logging based on interface state
3337+
if pollCount%3 == 0 || err != nil || !hasIP {
3338+
var status string
3339+
switch {
3340+
case err != nil:
3341+
status = "interface command failed"
3342+
case !hasInterface:
3343+
status = "interface not found"
3344+
case !hasIP:
3345+
status = "interface up but no DHCP lease"
3346+
default:
3347+
status = "interface has lease"
3348+
}
3349+
3350+
slog.Debug("Lease polling", "server", serverName, "interface", ifName, "poll", pollCount, "status", status, "lease", leaseInfo, "error", err)
3351+
}
3352+
3353+
if hasIP && leaseInfo != "" {
3354+
tokens := strings.Split(strings.TrimLeft(leaseInfo, " \t"), " ")
3355+
if len(tokens) >= 2 {
3356+
stripped, _ := strings.CutSuffix(tokens[1], "sec")
3357+
if currentLease, parseErr := strconv.Atoi(stripped); parseErr == nil {
3358+
if isL3Mode {
3359+
// L3VNI mode: REQUIRE 2-step process
3360+
if !shortLeaseFound && currentLease >= 5 && currentLease <= 15 {
3361+
shortLeaseFound = true
3362+
slog.Debug("L3 mode short lease detected", "server", serverName, "interface", ifName, "lease", currentLease)
3363+
3364+
continue
3365+
}
3366+
3367+
// Only accept full lease if we saw the short lease first
3368+
if currentLease >= 3590 {
3369+
if !shortLeaseFound {
3370+
return fmt.Errorf("L3VNI mode requires 2-step lease process, but no short lease detected before full lease on %s", ifName) //nolint:goerr113
3371+
}
3372+
slog.Debug("L3 mode 2-step lease completed", "server", serverName, "interface", ifName, "polls", pollCount, "initial_lease", initialLease, "final_lease", currentLease)
3373+
3374+
return nil
3375+
}
3376+
} else if currentLease > initialLease-30 {
3377+
// L2VNI mode: Accept direct renewal to full lease
3378+
slog.Debug("L2 mode renewal completed", "server", serverName, "interface", ifName, "polls", pollCount, "initial_lease", initialLease, "final_lease", currentLease)
3379+
3380+
return nil
3381+
}
3382+
}
3383+
}
3384+
}
3385+
3386+
select {
3387+
case <-renewalCtx.Done():
3388+
finalOut, _, _ := ssh.Run(ctx, fmt.Sprintf("ip addr show dev %s proto 4 | grep valid_lft", ifName))
3389+
3390+
if isL3Mode && !shortLeaseFound {
3391+
return fmt.Errorf("timeout waiting for L3VNI short lease on %s after %d polls (final: %s): %w",
3392+
ifName, pollCount, strings.TrimSpace(finalOut), renewalCtx.Err())
3393+
}
3394+
3395+
return fmt.Errorf("timeout after %d polls waiting for DHCP renewal on %s (initial: %d, final: %s): %w",
3396+
pollCount, ifName, initialLease, strings.TrimSpace(finalOut), renewalCtx.Err())
3397+
case <-time.After(1 * time.Second):
3398+
}
3399+
}
3400+
}
3401+
31083402
// Utilities and suite runners
31093403

31103404
func makeTestCtx(kube kclient.Client, setupOpts SetupVPCsOpts, vlabCfg *Config, vlab *VLAB, wipeBetweenTests bool, rtOpts ReleaseTestOpts) *VPCPeeringTestCtx {
@@ -3568,6 +3862,10 @@ func makeVpcPeeringsSingleVPCSuite(testCtx *VPCPeeringTestCtx) *JUnitTestSuite {
35683862
Name: "DNS/NTP/MTU/DHCP lease",
35693863
F: testCtx.dnsNtpMtuTest,
35703864
},
3865+
{
3866+
Name: "DHCP renewal",
3867+
F: testCtx.dhcpRenewalTest,
3868+
},
35713869
{
35723870
Name: "MCLAG Failover",
35733871
F: testCtx.mclagTest,

0 commit comments

Comments
 (0)