@@ -90,10 +90,10 @@ func ManageJobResources() error {
90
90
return err
91
91
}
92
92
93
- k8sJobMap := map [string ]* kbatch.Job {}
93
+ k8sJobMap := map [string ]kbatch.Job {}
94
94
k8sJobIDSet := strset.Set {}
95
95
for _ , kJob := range jobs {
96
- k8sJobMap [kJob .Labels ["jobID" ]] = & kJob
96
+ k8sJobMap [kJob .Labels ["jobID" ]] = kJob
97
97
k8sJobIDSet .Add (kJob .Labels ["jobID" ])
98
98
}
99
99
@@ -103,7 +103,7 @@ func ManageJobResources() error {
103
103
queueURL = pointer .String (queueURLMap [jobKey .ID ])
104
104
}
105
105
106
- k8sJob := k8sJobMap [jobKey .ID ]
106
+ k8sJob , jobFound := k8sJobMap [jobKey .ID ]
107
107
108
108
jobLogger , err := operator .GetJobLogger (jobKey )
109
109
if err != nil {
@@ -135,7 +135,7 @@ func ManageJobResources() error {
135
135
continue
136
136
}
137
137
138
- newStatusCode , msg , err := reconcileInProgressJob (jobState , queueURL , k8sJob )
138
+ newStatusCode , msg , err := reconcileInProgressJob (jobState , queueURL , jobFound )
139
139
if err != nil {
140
140
telemetry .Error (err )
141
141
operatorLogger .Error (err )
@@ -150,7 +150,7 @@ func ManageJobResources() error {
150
150
continue
151
151
}
152
152
}
153
- if queueURL == nil || k8sJob == nil {
153
+ if queueURL == nil {
154
154
// job has been submitted within the grace period; it may take a while for newly created queues and jobs to show up in list results
155
155
continue
156
156
}
@@ -249,7 +249,7 @@ func ManageJobResources() error {
249
249
}
250
250
251
251
// verifies that a queue exists for an in-progress job and that a k8s job exists for a job in running status; if verification fails, returns a job code to reflect the state
252
- func reconcileInProgressJob (jobState * job.State , queueURL * string , k8sJob * kbatch. Job ) (status.JobCode , string , error ) {
252
+ func reconcileInProgressJob (jobState * job.State , queueURL * string , jobFound bool ) (status.JobCode , string , error ) {
253
253
jobKey := jobState .JobKey
254
254
255
255
if queueURL == nil {
@@ -275,45 +275,49 @@ func reconcileInProgressJob(jobState *job.State, queueURL *string, k8sJob *kbatc
275
275
return jobState .Status , "" , nil
276
276
}
277
277
278
- if k8sJob == nil { // unexpected k8s job missing
278
+ if ! jobFound { // unexpected k8s job missing
279
279
return status .JobUnexpectedError , fmt .Sprintf ("terminating job %s; unable to find kubernetes job" , jobKey .UserString ()), nil
280
280
}
281
281
}
282
282
283
283
return jobState .Status , "" , nil
284
284
}
285
285
286
- func checkIfJobCompleted (jobState * job.State , queueURL string , k8sJob * kbatch.Job ) error {
286
+ func checkIfJobCompleted (jobState * job.State , queueURL string , k8sJob kbatch.Job ) error {
287
287
jobKey := jobState .JobKey
288
288
289
289
jobFailed , err := checkForJobFailure (jobKey , k8sJob )
290
290
if err != nil || jobFailed {
291
291
return err
292
292
}
293
293
294
- queueMessages , err := getQueueMetricsFromURL ( queueURL )
294
+ jobLogger , err := operator . GetJobLogger ( jobKey )
295
295
if err != nil {
296
296
return err
297
297
}
298
298
299
- jobLogger , err := operator .GetJobLogger (jobKey )
299
+ // job is still in-progress
300
+ if int (k8sJob .Status .Active ) != 0 {
301
+ return nil
302
+ }
303
+
304
+ queueMessages , err := getQueueMetricsFromURL (queueURL )
300
305
if err != nil {
301
306
return err
302
307
}
303
308
304
309
if ! queueMessages .IsEmpty () {
305
310
// Give time for queue metrics to reach consistency
306
- if k8sJob != nil && int (k8sJob .Status .Active ) == 0 {
307
- if _jobsToDelete .Has (jobKey .ID ) {
308
- _jobsToDelete .Remove (jobKey .ID )
309
- jobLogger .Error ("unexpected job status because cluster state indicates job has completed but metrics indicate that job is still in progress" )
310
- return errors .FirstError (
311
- job .SetUnexpectedErrorStatus (jobKey ),
312
- deleteJobRuntimeResources (jobKey ),
313
- )
314
- }
315
- _jobsToDelete .Add (jobKey .ID )
311
+ if _jobsToDelete .Has (jobKey .ID ) {
312
+ _jobsToDelete .Remove (jobKey .ID )
313
+ jobLogger .Error ("unexpected job status because cluster state indicates job has completed but metrics indicate that job is still in progress" )
314
+ return errors .FirstError (
315
+ job .SetUnexpectedErrorStatus (jobKey ),
316
+ deleteJobRuntimeResources (jobKey ),
317
+ )
316
318
}
319
+ _jobsToDelete .Add (jobKey .ID )
320
+
317
321
return nil
318
322
}
319
323
@@ -356,7 +360,7 @@ func checkIfJobCompleted(jobState *job.State, queueURL string, k8sJob *kbatch.Jo
356
360
return nil
357
361
}
358
362
359
- func checkForJobFailure (jobKey spec.JobKey , k8sJob * kbatch.Job ) (bool , error ) {
363
+ func checkForJobFailure (jobKey spec.JobKey , k8sJob kbatch.Job ) (bool , error ) {
360
364
jobLogger , err := operator .GetJobLogger (jobKey )
361
365
if err != nil {
362
366
return false , err
@@ -372,7 +376,7 @@ func checkForJobFailure(jobKey spec.JobKey, k8sJob *kbatch.Job) (bool, error) {
372
376
deleteJobRuntimeResources (jobKey ),
373
377
)
374
378
}
375
- if k8sJob != nil && int (k8sJob .Status .Failed ) > 0 {
379
+ if int (k8sJob .Status .Failed ) > 0 {
376
380
podStatus := k8s .GetPodStatus (& pod )
377
381
for _ , containerStatus := range pod .Status .ContainerStatuses {
378
382
if containerStatus .LastTerminationState .Terminated != nil {
@@ -394,9 +398,6 @@ func checkForJobFailure(jobKey spec.JobKey, k8sJob *kbatch.Job) (bool, error) {
394
398
}
395
399
}
396
400
397
- if k8sJob == nil {
398
- return false , nil
399
- }
400
401
if int (k8sJob .Status .Failed ) > 0 {
401
402
if ! reasonFound {
402
403
jobLogger .Error ("workers were killed for unknown reason" )
@@ -405,12 +406,6 @@ func checkForJobFailure(jobKey spec.JobKey, k8sJob *kbatch.Job) (bool, error) {
405
406
job .SetWorkerErrorStatus (jobKey ),
406
407
deleteJobRuntimeResources (jobKey ),
407
408
)
408
- } else if int (k8sJob .Status .Active ) == 0 && int (k8sJob .Status .Failed ) == 0 && len (pods ) == 0 {
409
- // really unexpected situation which doesn't hurt if we check
410
- return true , errors .FirstError (
411
- job .SetUnexpectedErrorStatus (jobKey ),
412
- deleteJobRuntimeResources (jobKey ),
413
- )
414
409
}
415
410
416
411
return false , nil
0 commit comments