@@ -1178,10 +1178,14 @@ func (r *Registry) cleanupOldJobs(ctx context.Context, olderThan time.Time) erro
11781178 var done bool
11791179 var err error
11801180 done , maxID , err = r .cleanupOldJobsPage (ctx , olderThan , maxID , cleanupPageSize )
1181- if err != nil || done {
1181+ if err != nil {
11821182 return err
11831183 }
1184+ if done {
1185+ break
1186+ }
11841187 }
1188+ return r .CleanupCorruptJobs (ctx )
11851189}
11861190
11871191// AbandonedJobInfoRowsCleanupQuery is used by the CLI command
@@ -1318,6 +1322,36 @@ func (r *Registry) cleanupOldJobsPage(
13181322 return ! morePages , maxID , nil
13191323}
13201324
// findCorruptJobsQuery selects the ids of system.jobs rows whose job_type is
// 'AUTO SQL STATS COMPACTION' and that have no matching row in
// system.job_info: the LEFT JOIN combined with the IS NULL filter keeps only
// jobs without any job_info entry.
1325+ const findCorruptJobsQuery = `
1326+ SELECT id
1327+ FROM system.jobs
1328+ LEFT JOIN system.job_info ON system.jobs.id = system.job_info.job_id
1329+ WHERE system.job_info.job_id IS NULL AND system.jobs.job_type = 'AUTO SQL STATS COMPACTION'
1330+ `
1331+
1332+ // CleanupCorruptJobs is a temporary cleanup function that deletes corrupt
1333+ // `AUTO SQL STATS COMPACTION` jobs. This function exists to clean up after
1334+ // #155165.
1335+ //
1336+ // TODO(jeffswenson): in a separate PR we should run this as a migration so we
1337+ // can guarantee the issue was cleaned up as of a specific version.
//
// The lookup (findCorruptJobsQuery) and every delete run inside one r.db.Txn
// closure. The closure yields nil when the query matches no rows; a failed
// query or delete is returned wrapped with a description of the step that
// failed.
1338+ func (r * Registry ) CleanupCorruptJobs (ctx context.Context ) error {
1339+ return r .db .Txn (ctx , func (ctx context.Context , txn isql.Txn ) error {
// Each returned row carries a single column, the job id (read as row[0]).
1340+ datums , err := txn .QueryBuffered (ctx , "get-corrupt-jobs" , txn .KV (), findCorruptJobsQuery )
1341+ if err != nil {
1342+ return errors .Wrap (err , "querying for broken sql activity stats compaction jobs" )
1343+ }
1344+ for _ , row := range datums {
1345+ id := jobspb .JobID (tree .MustBeDInt (row [0 ]))
// The id is logged at error level before the delete is attempted.
1346+ log .Dev .Errorf (ctx , "resetting broken sql activity stats compaction job %d" , id )
// The delete uses the same txn as the query above; the first failure
// aborts the loop and is returned from the closure.
1347+ if err := r .deleteJob (ctx , txn , id ); err != nil {
1348+ return errors .Wrapf (err , "deleting broken sql activity stats compaction job %d" , id )
1349+ }
1350+ }
1351+ return nil
1352+ })
1353+ }
1354+
13211355// DeleteTerminalJobByID deletes the given job ID if it is in a
13221356// terminal state. If it is in a non-terminal state, an error is
13231357// returned. This API should not be used.
@@ -1334,36 +1368,40 @@ func (r *Registry) DeleteTerminalJobByID(ctx context.Context, id jobspb.JobID) e
13341368 state := State (* row [0 ].(* tree.DString ))
13351369 switch state {
13361370 case StateSucceeded , StateCanceled , StateFailed :
1337- _ , err := txn .Exec (
1338- ctx , "delete-job" , txn .KV (), "DELETE FROM system.jobs WHERE id = $1" , id ,
1339- )
1371+ return r .deleteJob (ctx , txn , id )
1372+ default :
1373+ return errors .Newf ("job %d has non-terminal state: %q" , id , state )
1374+ }
1375+ })
1376+ }
1377+
// deleteJob removes the system.jobs row for id and then the rows keyed by that
// job_id in each table listed in jobMetadataTables, all through the supplied
// txn. It does not inspect the job's state; deciding whether the job may be
// deleted is left to the caller. The first error from any statement is
// returned as-is, without wrapping.
1378+ func (r * Registry ) deleteJob (ctx context.Context , txn isql.Txn , id jobspb.JobID ) error {
1379+ _ , err := txn .Exec (
1380+ ctx , "delete-job" , txn .KV (), "DELETE FROM system.jobs WHERE id = $1" , id ,
1381+ )
1382+ if err != nil {
1383+ return err
1384+ }
1385+ for i , tbl := range jobMetadataTables {
// The first table is always cleaned. For every later table the system
// schema version is fetched (again on each iteration) and the loop stops
// if it is older than V25_1_AddJobsTables.
// NOTE(review): presumably the later tables do not exist before that
// version; confirm against the jobMetadataTables definition.
1386+ if i > 0 {
1387+ v , err := txn .GetSystemSchemaVersion (ctx )
13401388 if err != nil {
13411389 return err
13421390 }
1343- for i , tbl := range jobMetadataTables {
1344- if i > 0 {
1345- v , err := txn .GetSystemSchemaVersion (ctx )
1346- if err != nil {
1347- return err
1348- }
1349- if v .Less (clusterversion .V25_1_AddJobsTables .Version ()) {
1350- break
1351- }
1352- }
1353-
1354- _ , err = txn .Exec (
1355- ctx , redact .RedactableString ("delete-job-" + tbl ), txn .KV (),
1356- "DELETE FROM system." + tbl + " WHERE job_id = $1" , id ,
1357- )
1358- if err != nil {
1359- return err
1360- }
1391+ if v .Less (clusterversion .V25_1_AddJobsTables .Version ()) {
1392+ break
13611393 }
1362- return nil
1363- default :
1364- return errors .Newf ("job %d has non-terminal state: %q" , id , state )
13651394 }
1366- })
1395+
1396+ _ , err = txn .Exec (
1397+ ctx , redact .RedactableString ("delete-job-" + tbl ), txn .KV (),
1398+ "DELETE FROM system." + tbl + " WHERE job_id = $1" , id ,
1399+ )
1400+ if err != nil {
1401+ return err
1402+ }
1403+ }
1404+ return nil
13671405}
13681406
13691407// PauseRequested marks the job with id as paused-requested using the specified txn (may be nil).
0 commit comments