@@ -21,6 +21,7 @@ import (
2121 "sort"
2222 "strconv"
2323 "strings"
24+ "sync"
2425 "time"
2526
2627 agentapi "go.githedgehog.com/fabric/api/agent/v1beta1"
@@ -3105,6 +3106,299 @@ func (testCtx *VPCPeeringTestCtx) prometheusObservabilityTest(ctx context.Contex
31053106 return false , nil , nil
31063107}
31073108
3109+ // Test DHCP renewal on VPC-attached interfaces
3110+ // Uses 1 server by default, all servers in extended mode
3111+ // Triggers DHCP renewal via networkctl and verifies connectivity
3112+ // Fails if any renewal takes longer than 15 seconds
3113+ func (testCtx * VPCPeeringTestCtx ) dhcpRenewalTest (ctx context.Context ) (bool , []RevertFunc , error ) {
3114+ vpcAttaches := & vpcapi.VPCAttachmentList {}
3115+ if err := testCtx .kube .List (ctx , vpcAttaches ); err != nil {
3116+ return false , nil , fmt .Errorf ("listing VPCAttachments: %w" , err )
3117+ }
3118+
3119+ servers := make ([]ServerWithInterface , 0 , len (vpcAttaches .Items ))
3120+ for _ , attach := range vpcAttaches .Items {
3121+ conn := & wiringapi.Connection {}
3122+ if err := testCtx .kube .Get (ctx , kclient.ObjectKey {
3123+ Namespace : kmetav1 .NamespaceDefault ,
3124+ Name : attach .Spec .Connection ,
3125+ }, conn ); err != nil {
3126+ continue
3127+ }
3128+
3129+ _ , serverNames , _ , _ , err := conn .Spec .Endpoints ()
3130+ if err != nil || len (serverNames ) != 1 {
3131+ continue
3132+ }
3133+
3134+ vpc := & vpcapi.VPC {}
3135+ if err := testCtx .kube .Get (ctx , kclient.ObjectKey {
3136+ Namespace : kmetav1 .NamespaceDefault ,
3137+ Name : attach .Spec .VPCName (),
3138+ }, vpc ); err != nil {
3139+ continue
3140+ }
3141+
3142+ subnet := vpc .Spec .Subnets [attach .Spec .SubnetName ()]
3143+ if subnet == nil || ! subnet .DHCP .Enable {
3144+ continue
3145+ }
3146+
3147+ _ , err = GetServerNetconfCmd (conn , subnet .VLAN , testCtx .setupOpts .HashPolicy )
3148+ if err != nil {
3149+ continue
3150+ }
3151+
3152+ var ifName string
3153+ if conn .Spec .Unbundled != nil {
3154+ ifName = fmt .Sprintf ("%s.%d" , conn .Spec .Unbundled .Link .Server .LocalPortName (), subnet .VLAN )
3155+ } else {
3156+ ifName = fmt .Sprintf ("bond0.%d" , subnet .VLAN )
3157+ }
3158+
3159+ servers = append (servers , ServerWithInterface {
3160+ Name : serverNames [0 ],
3161+ Interface : ifName ,
3162+ })
3163+ }
3164+
3165+ if len (servers ) == 0 {
3166+ slog .Info ("No servers with DHCP interfaces found, skipping DHCP renewal test" )
3167+
3168+ return true , nil , fmt .Errorf ("no servers with DHCP interfaces found" ) //nolint:goerr113
3169+ }
3170+
3171+ testServerCount := 1
3172+ if testCtx .extended {
3173+ testServerCount = len (servers )
3174+ }
3175+
3176+ servers = servers [:testServerCount ]
3177+
3178+ // Determine timeout based on VPC mode
3179+ renewalTimeout := DefaultDHCPRenewalTimeoutL2
3180+ if testCtx .setupOpts .VPCMode == vpcapi .VPCModeL3VNI || testCtx .setupOpts .VPCMode == vpcapi .VPCModeL3Flat {
3181+ renewalTimeout = DefaultDHCPRenewalTimeoutL3
3182+ }
3183+
3184+ slog .Info ("Testing DHCP renewal" , "servers" , len (servers ), "mode" , testCtx .setupOpts .VPCMode , "timeout" , renewalTimeout )
3185+
3186+ var wg sync.WaitGroup
3187+ results := make (chan RenewalResult , len (servers ))
3188+ errors := sync.Map {}
3189+
3190+ for _ , server := range servers {
3191+ wg .Add (1 )
3192+ go func (srv ServerWithInterface ) {
3193+ defer wg .Done ()
3194+
3195+ start := time .Now ()
3196+ err := testCtx .performDHCPRenewal (ctx , srv .Name , srv .Interface )
3197+ duration := time .Since (start )
3198+
3199+ result := RenewalResult {
3200+ Server : srv .Name ,
3201+ Duration : duration ,
3202+ Error : err ,
3203+ }
3204+
3205+ if err != nil {
3206+ errors .Store (fmt .Sprintf ("dhcp-renewal--%s" , srv .Name ), err )
3207+ }
3208+
3209+ results <- result
3210+ }(server )
3211+ }
3212+
3213+ wg .Wait ()
3214+ close (results )
3215+
3216+ var failures []string
3217+ var slowRenewals []string
3218+ successCount := 0
3219+ maxDuration := time .Duration (0 )
3220+
3221+ for result := range results {
3222+ if result .Error != nil {
3223+ failures = append (failures , fmt .Sprintf ("%s: %v" , result .Server , result .Error ))
3224+ } else {
3225+ successCount ++
3226+ if result .Duration > maxDuration {
3227+ maxDuration = result .Duration
3228+ }
3229+ if result .Duration > renewalTimeout {
3230+ slowRenewals = append (slowRenewals ,
3231+ fmt .Sprintf ("%s: %v" , result .Server , result .Duration ))
3232+ }
3233+ }
3234+ }
3235+
3236+ var additionalErrors []string
3237+ errors .Range (func (key , value any ) bool {
3238+ additionalErrors = append (additionalErrors , fmt .Sprintf ("%s: %v" , key , value ))
3239+
3240+ return true
3241+ })
3242+
3243+ if len (failures ) > 0 || len (additionalErrors ) > 0 {
3244+ failures = append (failures , additionalErrors ... )
3245+
3246+ return false , nil , fmt .Errorf ("DHCP renewal failures: %v" , failures ) //nolint:goerr113
3247+ }
3248+
3249+ if len (slowRenewals ) > 0 {
3250+ return false , nil , fmt .Errorf ("slow DHCP renewals detected (>%v): %v" , renewalTimeout , slowRenewals ) //nolint:goerr113
3251+ }
3252+
3253+ slog .Info ("DHCP renewal test passed" , "servers" , len (servers ), "maxDuration" , maxDuration )
3254+
3255+ return false , nil , nil
3256+ }
3257+
// Timing limits applied by the DHCP renewal test.
const (
	// DefaultDHCPRenewalTimeoutL2 is the longest acceptable renewal duration
	// when the VPC mode is not one of the L3 modes; L2VNI renewals should be
	// fast.
	DefaultDHCPRenewalTimeoutL2 time.Duration = 10 * time.Second

	// DefaultDHCPRenewalTimeoutL3 is the longest acceptable renewal duration
	// in L3 VPC modes, where renewal is a 2-step process.
	DefaultDHCPRenewalTimeoutL3 time.Duration = 20 * time.Second

	// DHCPRenewalMaxWait is the hard timeout for waiting on any single
	// renewal, regardless of VPC mode.
	DHCPRenewalMaxWait time.Duration = 25 * time.Second
)
3263+
// ServerWithInterface pairs a server with the name of the network interface
// on that server whose DHCP lease is to be renewed.
type ServerWithInterface struct {
	// Name is the server name; Interface is the interface name on it
	// (for example "bond0.<vlan>").
	Name, Interface string
}
3268+
// RenewalResult is the outcome of one DHCP renewal attempt on one server.
type RenewalResult struct {
	// Server is the name of the server the renewal was run on.
	Server string
	// Duration is the wall-clock time the renewal attempt took.
	Duration time.Duration
	// Error is the renewal error, or nil when the renewal succeeded.
	Error error
}
3274+
3275+ func (testCtx * VPCPeeringTestCtx ) performDHCPRenewal (ctx context.Context , serverName , ifName string ) error {
3276+ isL3Mode := testCtx .setupOpts .VPCMode == vpcapi .VPCModeL3VNI || testCtx .setupOpts .VPCMode == vpcapi .VPCModeL3Flat
3277+
3278+ ssh , err := testCtx .getSSH (ctx , serverName )
3279+ if err != nil {
3280+ return fmt .Errorf ("getting ssh config for server %s: %w" , serverName , err )
3281+ }
3282+
3283+ // Log initial state
3284+ initialOut , _ , _ := ssh .Run (ctx ,
3285+ fmt .Sprintf ("ip addr show dev %s proto 4 | grep valid_lft" , ifName ))
3286+ slog .Debug ("Initial lease state" , "server" , serverName , "interface" , ifName , "lease" , strings .TrimSpace (initialOut ))
3287+
3288+ // Parse initial lease time
3289+ var initialLease int
3290+ if initialOut != "" {
3291+ tokens := strings .Split (strings .TrimLeft (initialOut , " \t " ), " " )
3292+ if len (tokens ) >= 2 {
3293+ stripped , _ := strings .CutSuffix (tokens [1 ], "sec" )
3294+ if lease , parseErr := strconv .Atoi (stripped ); parseErr == nil {
3295+ initialLease = lease
3296+ }
3297+ }
3298+ }
3299+
3300+ _ , _ , err = ssh .Run (ctx , fmt .Sprintf ("sudo networkctl reconfigure %s" , ifName ))
3301+ if err != nil {
3302+ return fmt .Errorf ("networkctl reconfigure failed: %w" , err )
3303+ }
3304+
3305+ renewalCtx , cancel := context .WithTimeout (ctx , DHCPRenewalMaxWait )
3306+ defer cancel ()
3307+
3308+ pollCount := 0
3309+ shortLeaseFound := false
3310+
3311+ for {
3312+ pollCount ++
3313+ out , _ , err := ssh .Run (ctx , fmt .Sprintf ("ip addr show dev %s proto 4" , ifName ))
3314+
3315+ // Parse the output to understand what's happening
3316+ var leaseInfo string
3317+ var hasInterface bool
3318+ var hasIP bool
3319+
3320+ if err == nil {
3321+ hasInterface = true
3322+ if strings .Contains (out , "valid_lft" ) {
3323+ // Extract just the lease line for logging
3324+ lines := strings .Split (out , "\n " )
3325+ for _ , line := range lines {
3326+ if strings .Contains (line , "valid_lft" ) {
3327+ leaseInfo = strings .TrimSpace (line )
3328+ hasIP = true
3329+
3330+ break
3331+ }
3332+ }
3333+ }
3334+ }
3335+
3336+ // Improved logging based on interface state
3337+ if pollCount % 3 == 0 || err != nil || ! hasIP {
3338+ var status string
3339+ switch {
3340+ case err != nil :
3341+ status = "interface command failed"
3342+ case ! hasInterface :
3343+ status = "interface not found"
3344+ case ! hasIP :
3345+ status = "interface up but no DHCP lease"
3346+ default :
3347+ status = "interface has lease"
3348+ }
3349+
3350+ slog .Debug ("Lease polling" , "server" , serverName , "interface" , ifName , "poll" , pollCount , "status" , status , "lease" , leaseInfo , "error" , err )
3351+ }
3352+
3353+ if hasIP && leaseInfo != "" {
3354+ tokens := strings .Split (strings .TrimLeft (leaseInfo , " \t " ), " " )
3355+ if len (tokens ) >= 2 {
3356+ stripped , _ := strings .CutSuffix (tokens [1 ], "sec" )
3357+ if currentLease , parseErr := strconv .Atoi (stripped ); parseErr == nil {
3358+ if isL3Mode {
3359+ // L3VNI mode: REQUIRE 2-step process
3360+ if ! shortLeaseFound && currentLease >= 5 && currentLease <= 15 {
3361+ shortLeaseFound = true
3362+ slog .Debug ("L3 mode short lease detected" , "server" , serverName , "interface" , ifName , "lease" , currentLease )
3363+
3364+ continue
3365+ }
3366+
3367+ // Only accept full lease if we saw the short lease first
3368+ if currentLease >= 3590 {
3369+ if ! shortLeaseFound {
3370+ return fmt .Errorf ("L3VNI mode requires 2-step lease process, but no short lease detected before full lease on %s" , ifName ) //nolint:goerr113
3371+ }
3372+ slog .Debug ("L3 mode 2-step lease completed" , "server" , serverName , "interface" , ifName , "polls" , pollCount , "initial_lease" , initialLease , "final_lease" , currentLease )
3373+
3374+ return nil
3375+ }
3376+ } else if currentLease > initialLease - 30 {
3377+ // L2VNI mode: Accept direct renewal to full lease
3378+ slog .Debug ("L2 mode renewal completed" , "server" , serverName , "interface" , ifName , "polls" , pollCount , "initial_lease" , initialLease , "final_lease" , currentLease )
3379+
3380+ return nil
3381+ }
3382+ }
3383+ }
3384+ }
3385+
3386+ select {
3387+ case <- renewalCtx .Done ():
3388+ finalOut , _ , _ := ssh .Run (ctx , fmt .Sprintf ("ip addr show dev %s proto 4 | grep valid_lft" , ifName ))
3389+
3390+ if isL3Mode && ! shortLeaseFound {
3391+ return fmt .Errorf ("timeout waiting for L3VNI short lease on %s after %d polls (final: %s): %w" ,
3392+ ifName , pollCount , strings .TrimSpace (finalOut ), renewalCtx .Err ())
3393+ }
3394+
3395+ return fmt .Errorf ("timeout after %d polls waiting for DHCP renewal on %s (initial: %d, final: %s): %w" ,
3396+ pollCount , ifName , initialLease , strings .TrimSpace (finalOut ), renewalCtx .Err ())
3397+ case <- time .After (1 * time .Second ):
3398+ }
3399+ }
3400+ }
3401+
31083402// Utilities and suite runners
31093403
31103404func makeTestCtx (kube kclient.Client , setupOpts SetupVPCsOpts , vlabCfg * Config , vlab * VLAB , wipeBetweenTests bool , rtOpts ReleaseTestOpts ) * VPCPeeringTestCtx {
@@ -3568,6 +3862,10 @@ func makeVpcPeeringsSingleVPCSuite(testCtx *VPCPeeringTestCtx) *JUnitTestSuite {
35683862 Name : "DNS/NTP/MTU/DHCP lease" ,
35693863 F : testCtx .dnsNtpMtuTest ,
35703864 },
3865+ {
3866+ Name : "DHCP renewal" ,
3867+ F : testCtx .dhcpRenewalTest ,
3868+ },
35713869 {
35723870 Name : "MCLAG Failover" ,
35733871 F : testCtx .mclagTest ,
0 commit comments