@@ -27,7 +27,7 @@ type CheckpointAndPushResult =
   | { success: true; checkpoint: CheckpointData }
   | {
       success: false;
-      reason?: "CANCELED" | "DISABLED" | "ERROR" | "IN_PROGRESS" | "NO_SUPPORT" | "SKIP_RETRYING";
+      reason?: "CANCELED" | "ERROR" | "SKIP_RETRYING";
     };

 type CheckpointData = {
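The failure branch of the union loses the reasons that are now handled before a checkpoint attempt even starts, so callers only discriminate on the three reasons that can still escape. A minimal sketch of the intended narrowing; the `result` value and the `location` field are illustrative, not taken from this diff:

```ts
declare const result: CheckpointAndPushResult;

if (result.success) {
  // Narrowed to { success: true; checkpoint: CheckpointData }
  console.log(result.checkpoint);
} else {
  // `reason` is optional, so undefined must be handled too
  switch (result.reason) {
    case "CANCELED":
    case "SKIP_RETRYING":
      break; // don't retry
    case "ERROR":
    default:
      break; // surface or retry the failure
  }
}
```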
@@ -87,9 +87,14 @@ export class Checkpointer {
   #dockerMode: boolean;

   #logger = new SimpleStructuredLogger("checkpointer");
-  #abortControllers = new Map<string, AbortController>();
+
   #failedCheckpoints = new Map<string, unknown>();
-  #waitingToCheckpoint = new Set<string>();
+
+  // Indexed by run ID
+  #runAbortControllers = new Map<
+    string,
+    { signal: AbortSignal; abort: AbortController["abort"] }
+  >();

   private registryHost: string;
   private registryNamespace: string;
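The map stores only a `signal` plus a pre-bound `abort` function rather than the whole `AbortController`, which keeps consumers from reaching into controller internals. A self-contained sketch of the pattern (names are illustrative, not from this class):

```ts
// Minimal sketch of a per-run abort registry.
const registry = new Map<string, { signal: AbortSignal; abort: AbortController["abort"] }>();

function register(runId: string): AbortSignal {
  const controller = new AbortController();
  // abort() must be bound: calling it detached from the controller
  // throws an "Illegal invocation" TypeError.
  registry.set(runId, {
    signal: controller.signal,
    abort: controller.abort.bind(controller),
  });
  return controller.signal;
}

function cancel(runId: string, reason?: unknown): boolean {
  const entry = registry.get(runId);
  if (!entry || entry.signal.aborted) return false;
  entry.abort(reason);
  registry.delete(runId);
  return true;
}
```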
@@ -196,25 +201,73 @@ export class Checkpointer {
     const start = performance.now();
     this.#logger.log(`checkpointAndPush() start`, { start, opts });

-    let interval: NodeJS.Timer | undefined;
+    const { runId } = opts;

+    let interval: NodeJS.Timer | undefined;
     if (opts.shouldHeartbeat) {
       interval = setInterval(() => {
-        this.#logger.log("Sending heartbeat", { runId: opts.runId });
-        this.opts.heartbeat(opts.runId);
+        this.#logger.log("Sending heartbeat", { runId });
+        this.opts.heartbeat(runId);
       }, 20_000);
     }

+    const controller = new AbortController();
+    const signal = controller.signal;
+    const abort = controller.abort.bind(controller);
+
+    const onAbort = () => {
+      this.#logger.error("Checkpoint aborted", { runId, options: opts });
+    };
+
+    signal.addEventListener("abort", onAbort, { once: true });
+
+    const removeCurrentAbortController = () => {
+      const controller = this.#runAbortControllers.get(runId);
+
+      // Ensure only the current controller is removed
+      if (controller && controller.signal === signal) {
+        this.#runAbortControllers.delete(runId);
+      }
+
+      // Remove the abort listener in case it hasn't fired
+      signal.removeEventListener("abort", onAbort);
+    };
+
+    if (!this.#dockerMode && !this.#canCheckpoint) {
+      this.#logger.error("No checkpoint support. Simulation requires docker.");
+      this.#failCheckpoint(runId, "NO_SUPPORT");
+      return;
+    }
+
+    if (this.#isRunCheckpointing(runId)) {
+      this.#logger.error("Checkpoint procedure already in progress", { options: opts });
+      this.#failCheckpoint(runId, "IN_PROGRESS");
+      return;
+    }
+
+    // This is a new checkpoint, clear any last failure for this run
+    this.#clearFailedCheckpoint(runId);
+
+    if (this.disableCheckpointSupport) {
+      this.#logger.error("Checkpoint support disabled", { options: opts });
+      this.#failCheckpoint(runId, "DISABLED");
+      return;
+    }
+
+    this.#runAbortControllers.set(runId, { signal, abort });
+
     try {
-      const result = await this.#checkpointAndPushWithBackoff(opts, delayMs);
+      const result = await this.#checkpointAndPushWithBackoff(opts, { delayMs, signal });

       const end = performance.now();
       this.#logger.log(`checkpointAndPush() end`, {
         start,
         end,
         diff: end - start,
+        diffWithoutDelay: end - start - (delayMs ?? 0),
         opts,
         success: result.success,
+        delayMs,
       });

       if (!result.success) {
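Because a new checkpoint for the same run can register its own entry before an older one finishes cleanup, `removeCurrentAbortController` compares signals before deleting. A reduced sketch of that guard, outside the class:

```ts
// Sketch: only the owner of the current signal may remove the map entry.
function makeRemover(
  map: Map<string, { signal: AbortSignal }>,
  runId: string,
  signal: AbortSignal
): () => void {
  return () => {
    const entry = map.get(runId);
    // A newer checkpoint may have replaced the entry; leave that one alone
    if (entry && entry.signal === signal) {
      map.delete(runId);
    }
  };
}
```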
@@ -226,40 +279,41 @@
       if (opts.shouldHeartbeat) {
         clearInterval(interval);
       }
+      removeCurrentAbortController();
     }
   }

-  #isCheckpointing(runId: string) {
-    return this.#abortControllers.has(runId) || this.#waitingToCheckpoint.has(runId);
+  #isRunCheckpointing(runId: string) {
+    return this.#runAbortControllers.has(runId);
   }

-  cancelCheckpoint(runId: string): boolean {
+  cancelAllCheckpointsForRun(runId: string): boolean {
+    this.#logger.log("cancelAllCheckpointsForRun: call", { runId });
+
     // If the last checkpoint failed, pretend we canceled it
     // This ensures tasks don't wait for external resume messages to continue
     if (this.#hasFailedCheckpoint(runId)) {
+      this.#logger.log("cancelAllCheckpointsForRun: hasFailedCheckpoint", { runId });
       this.#clearFailedCheckpoint(runId);
       return true;
     }

-    if (this.#waitingToCheckpoint.has(runId)) {
-      this.#waitingToCheckpoint.delete(runId);
-      return true;
-    }
-
-    const controller = this.#abortControllers.get(runId);
+    const controller = this.#runAbortControllers.get(runId);

     if (!controller) {
-      this.#logger.debug("Nothing to cancel", { runId });
+      this.#logger.debug("cancelAllCheckpointsForRun: no abort controller", { runId });
       return false;
     }

-    if (controller.signal.aborted) {
-      this.#logger.debug("Controller already aborted", { runId });
+    const { abort, signal } = controller;
+
+    if (signal.aborted) {
+      this.#logger.debug("cancelAllCheckpointsForRun: signal already aborted", { runId });
       return false;
     }

-    controller.abort("cancelCheckpoint()");
-    this.#abortControllers.delete(runId);
+    abort("cancelCheckpoint()");
+    this.#runAbortControllers.delete(runId);

     return true;
   }
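One `abort()` now reaches every wait that shares the signal: the initial delay, the retry delays, and the in-flight buildah/crictl child processes. A hypothetical call site (the surrounding wiring is not part of this diff):

```ts
declare const checkpointer: Checkpointer;
declare const runId: string;

// e.g. when the run is resumed externally or its container exits
const canceled = checkpointer.cancelAllCheckpointsForRun(runId);
if (!canceled) {
  // Nothing was in flight: no checkpoint existed for this run,
  // or its signal had already aborted.
}
```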
@@ -272,19 +326,16 @@
       deploymentVersion,
       attemptNumber,
     }: CheckpointAndPushOptions,
-    delayMs?: number
+    { delayMs, signal }: { delayMs?: number; signal: AbortSignal }
   ): Promise<CheckpointAndPushResult> {
     if (delayMs && delayMs > 0) {
       this.#logger.log("Delaying checkpoint", { runId, delayMs });

-      this.#waitingToCheckpoint.add(runId);
-      await setTimeout(delayMs);
-
-      if (!this.#waitingToCheckpoint.has(runId)) {
+      try {
+        await setTimeout(delayMs, undefined, { signal });
+      } catch (error) {
         this.#logger.log("Checkpoint canceled during initial delay", { runId });
         return { success: false, reason: "CANCELED" };
-      } else {
-        this.#waitingToCheckpoint.delete(runId);
       }
     }

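This replaces the `#waitingToCheckpoint` set with the promisified `setTimeout` from `node:timers/promises`, which rejects when its signal aborts. A self-contained sketch of the abortable-sleep behavior:

```ts
import { setTimeout } from "node:timers/promises";

// Sketch: an abortable sleep. When `signal` aborts, the promise rejects
// with an AbortError (error.code === "ABORT_ERR") instead of resolving.
async function sleep(ms: number, signal: AbortSignal): Promise<boolean> {
  try {
    await setTimeout(ms, undefined, { signal });
    return true; // slept the full duration
  } catch {
    return false; // aborted early
  }
}
```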
@@ -310,24 +361,24 @@
         delay,
       });

-      this.#waitingToCheckpoint.add(runId);
-      await setTimeout(delay.milliseconds);
-
-      if (!this.#waitingToCheckpoint.has(runId)) {
-        this.#logger.log("Checkpoint canceled while waiting for retry", { runId });
+      try {
+        await setTimeout(delay.milliseconds, undefined, { signal });
+      } catch (error) {
+        this.#logger.log("Checkpoint canceled during retry delay", { runId });
         return { success: false, reason: "CANCELED" };
-      } else {
-        this.#waitingToCheckpoint.delete(runId);
       }
     }

-    const result = await this.#checkpointAndPush({
-      runId,
-      leaveRunning,
-      projectRef,
-      deploymentVersion,
-      attemptNumber,
-    });
+    const result = await this.#checkpointAndPush(
+      {
+        runId,
+        leaveRunning,
+        projectRef,
+        deploymentVersion,
+        attemptNumber,
+      },
+      { signal }
+    );

     if (result.success) {
       return result;
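The retry path uses the same abortable sleep, with `delay.milliseconds` coming from a backoff helper this diff doesn't show. A compressed sketch of the overall retry shape, with the backoff source stubbed as `nextDelayMs` (a hypothetical name):

```ts
import { setTimeout } from "node:timers/promises";

// Sketch of the retry loop shape; `nextDelayMs` stands in for the
// exponential-backoff calculator used above.
async function withAbortableRetries<T>(
  attempt: () => Promise<T | undefined>,
  nextDelayMs: (retryIndex: number) => number,
  maxRetries: number,
  signal: AbortSignal
): Promise<T | undefined> {
  for (let retryIndex = 0; retryIndex <= maxRetries; retryIndex++) {
    const result = await attempt();
    if (result !== undefined) return result;

    try {
      // Same abortable sleep as the initial delay: rejects when canceled
      await setTimeout(nextDelayMs(retryIndex), undefined, { signal });
    } catch {
      return undefined; // canceled during a retry delay
    }
  }
  return undefined;
}
```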
@@ -339,24 +390,6 @@
       return result;
     }

-    if (result.reason === "IN_PROGRESS") {
-      this.#logger.log("Checkpoint already in progress, won't retry", { runId });
-      this.#failCheckpoint(runId, result.reason);
-      return result;
-    }
-
-    if (result.reason === "NO_SUPPORT") {
-      this.#logger.log("No checkpoint support, won't retry", { runId });
-      this.#failCheckpoint(runId, result.reason);
-      return result;
-    }
-
-    if (result.reason === "DISABLED") {
-      this.#logger.log("Checkpoint support disabled, won't retry", { runId });
-      this.#failCheckpoint(runId, result.reason);
-      return result;
-    }
-
     if (result.reason === "SKIP_RETRYING") {
       this.#logger.log("Skipping retrying", { runId });
       return result;
@@ -384,13 +417,16 @@
     return { success: false, reason: "ERROR" };
   }

-  async #checkpointAndPush({
-    runId,
-    leaveRunning = true, // This mirrors kubernetes behaviour more accurately
-    projectRef,
-    deploymentVersion,
-    attemptNumber,
-  }: CheckpointAndPushOptions): Promise<CheckpointAndPushResult> {
+  async #checkpointAndPush(
+    {
+      runId,
+      leaveRunning = true, // This mirrors kubernetes behaviour more accurately
+      projectRef,
+      deploymentVersion,
+      attemptNumber,
+    }: CheckpointAndPushOptions,
+    { signal }: { signal: AbortSignal }
+  ): Promise<CheckpointAndPushResult> {
     await this.init();

     const options = {
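Accepting a `{ signal }` options object mirrors the Web-platform convention (`fetch`, `addEventListener`), which makes cancellation composable down a call chain. A minimal sketch of the threading, with hypothetical function names:

```ts
// Sketch: one signal threaded through nested async layers, fetch-style.
async function outer(opts: { signal: AbortSignal }): Promise<void> {
  await inner({ signal: opts.signal });
}

async function inner({ signal }: { signal: AbortSignal }): Promise<void> {
  signal.throwIfAborted(); // bail out early if already canceled (Node 17.3+)
  // ... do abortable work, forwarding `signal` further down ...
}
```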
@@ -401,47 +437,12 @@
       attemptNumber,
     };

-    if (!this.#dockerMode && !this.#canCheckpoint) {
-      this.#logger.error("No checkpoint support. Simulation requires docker.");
-      return { success: false, reason: "NO_SUPPORT" };
-    }
-
-    if (this.#isCheckpointing(runId)) {
-      this.#logger.error("Checkpoint procedure already in progress", { options });
-      return { success: false, reason: "IN_PROGRESS" };
-    }
-
-    // This is a new checkpoint, clear any last failure for this run
-    this.#clearFailedCheckpoint(runId);
-
-    if (this.disableCheckpointSupport) {
-      this.#logger.error("Checkpoint support disabled", { options });
-      return { success: false, reason: "DISABLED" };
-    }
-
-    const controller = new AbortController();
-    this.#abortControllers.set(runId, controller);
-
-    const onAbort = () => {
-      this.#logger.error("Checkpoint aborted", { options });
-      controller.signal.removeEventListener("abort", onAbort);
-    };
-    controller.signal.addEventListener("abort", onAbort);
-
     const shortCode = nanoid(8);
     const imageRef = this.#getImageRef(projectRef, deploymentVersion, shortCode);
     const exportLocation = this.#getExportLocation(projectRef, deploymentVersion, shortCode);

-    const buildah = new Buildah({ id: `${runId}-${shortCode}`, abortSignal: controller.signal });
-    const crictl = new Crictl({ id: `${runId}-${shortCode}`, abortSignal: controller.signal });
-
-    const removeCurrentAbortController = () => {
-      // Ensure only the current controller is removed
-      if (this.#abortControllers.get(runId) === controller) {
-        this.#abortControllers.delete(runId);
-      }
-      controller.signal.removeEventListener("abort", onAbort);
-    };
+    const buildah = new Buildah({ id: `${runId}-${shortCode}`, abortSignal: signal });
+    const crictl = new Crictl({ id: `${runId}-${shortCode}`, abortSignal: signal });

     const cleanup = async () => {
       const metadata = {
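`Buildah` and `Crictl` now receive the caller's signal instead of a controller created per call. Their internals aren't shown in this diff, but a wrapper like this would typically honor `abortSignal` by forwarding it to the spawned process; Node's `child_process` kills the child when the signal aborts. A sketch under that assumption:

```ts
import { execFile } from "node:child_process";

// Sketch: how a CLI wrapper might honor an abortSignal option.
function run(command: string, args: string[], abortSignal: AbortSignal): Promise<string> {
  return new Promise((resolve, reject) => {
    // `signal` makes the child process abortable; on abort the callback
    // receives an AbortError and the child is killed.
    execFile(command, args, { signal: abortSignal }, (error, stdout) => {
      error ? reject(error) : resolve(stdout);
    });
  });
}
```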
@@ -452,7 +453,6 @@

     if (this.#dockerMode) {
       this.#logger.debug("Skipping cleanup in docker mode", metadata);
-      removeCurrentAbortController();
       return;
     }
@@ -464,28 +464,26 @@
       } catch (error) {
         this.#logger.error("Error during cleanup", { ...metadata, error });
       }
-
-      removeCurrentAbortController();
     };

     try {
       await this.chaosMonkey.call();

-      this.#logger.log("Checkpointing:", { options });
+      this.#logger.log("checkpointAndPush: checkpointing", { options });

       const containterName = this.#getRunContainerName(runId);

       // Create checkpoint (docker)
       if (this.#dockerMode) {
         await this.#createDockerCheckpoint(
-          controller.signal,
+          signal,
           runId,
           exportLocation,
           leaveRunning,
           attemptNumber
         );

-        this.#logger.log("checkpoint created:", {
+        this.#logger.log("checkpointAndPush: checkpoint created", {
           runId,
           location: exportLocation,
         });
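`#createDockerCheckpoint` itself is not part of this diff. For reference, Docker's experimental CRIU integration is driven by `docker checkpoint create`, which supports `--leave-running` and `--checkpoint-dir`; a rough, hypothetical sketch of how the parameters seen here could map onto it:

```ts
import { execFile } from "node:child_process";
import { promisify } from "node:util";

const execFileAsync = promisify(execFile);

// Rough sketch only: the real method also takes runId/attemptNumber and
// resolves the container name itself. The checkpoint name is hypothetical.
async function createDockerCheckpoint(
  signal: AbortSignal,
  containerName: string,
  exportLocation: string,
  leaveRunning: boolean
): Promise<void> {
  await execFileAsync(
    "docker",
    [
      "checkpoint",
      "create",
      `--leave-running=${leaveRunning}`,
      `--checkpoint-dir=${exportLocation}`,
      containerName,
      "checkpoint-1", // hypothetical checkpoint name
    ],
    { signal } // aborting the signal kills the docker CLI process
  );
}
```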
@@ -586,13 +584,16 @@
         }
       }

-      this.#logger.error("Unhandled checkpoint error", { options, error });
+      this.#logger.error("Unhandled checkpoint error", {
+        options,
+        error: error instanceof Error ? error.message : error,
+      });

       return { success: false, reason: "ERROR" };
     } finally {
       await cleanup();

-      if (controller.signal.aborted) {
+      if (signal.aborted) {
         this.#logger.error("Checkpoint canceled: Cleanup", { options });

         // Overrides any prior return value
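Logging `error.message` instead of the raw error matters for structured logs: `Error` properties are non-enumerable, so a bare error typically JSON-stringifies to `{}`. The narrowing idiom used above, extracted into a sketch:

```ts
// Sketch: safe message extraction for structured logging.
function toLoggable(error: unknown): unknown {
  return error instanceof Error ? error.message : error;
}

JSON.stringify({ error: new Error("boom") });              // '{"error":{}}'
JSON.stringify({ error: toLoggable(new Error("boom")) });  // '{"error":"boom"}'
```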