Skip to content

Commit 06985bf

Browse files
authored
[autorevert] fix handling for insufficient successes (#7235)
Previously the code was trying to group branches for restarts resulting from "infra check" and from "insufficient events", and this was a mistake, resulting in delayed restarts. Specifically, in this situation: <img width="999" height="747" alt="image" src="https://github.com/user-attachments/assets/9cd0051e-8d87-4fe2-af90-88a776847c4d" /> a restart on the success side is expected, but the system waits for pending job on the failure side. This PR decouples and simplifies the logic. Now, all restarts are scheduled independently (relying on set deduplication) and all final checks are performed afterwards. Added a unit test to specifically verify the case above.
1 parent d2b0c00 commit 06985bf

File tree

2 files changed

+83
-31
lines changed

2 files changed

+83
-31
lines changed

aws/lambda/pytorch-auto-revert/pytorch_auto_revert/signal.py

Lines changed: 40 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,9 @@ class IneligibleReason(Enum):
5151
NO_PARTITION = "no_partition" # insufficient commit history to form partitions
5252
INFRA_NOT_CONFIRMED = "infra_not_confirmed" # infra check not confirmed
5353
INSUFFICIENT_FAILURES = "insufficient_failures" # not enough failures to make call
54+
INSUFFICIENT_SUCCESSES = (
55+
"insufficient_successes" # not enough successes to make call
56+
)
5457
PENDING_GAP = "pending_gap" # unknown/pending commits present
5558

5659

@@ -366,41 +369,39 @@ def process_valid_autorevert_pattern(
366369
if not c.events:
367370
restart_commits.add(c.head_sha)
368371

372+
# infra check section
373+
369374
infra_check_result = partition.confirm_not_an_infra_issue()
370-
# note re: event_count < 3:
371-
# this is a confidence heuristic to detect flakiness, can adjust as needed
372375
if (
373376
infra_check_result == InfraCheckResult.RESTART_FAILURE
374-
or partition.failure_events_count() < 3
377+
and not partition.failed[-1].has_pending
375378
):
376-
if not partition.failed[-1].has_pending:
377-
# restarting oldest failed
378-
restart_commits.add(partition.failed[-1].head_sha)
379-
else:
380-
if infra_check_result == InfraCheckResult.RESTART_FAILURE:
381-
return Ineligible(
382-
IneligibleReason.INFRA_NOT_CONFIRMED,
383-
f"waiting on pending events on suspected failure side: {partition.failed[-1].head_sha}",
384-
)
385-
else:
386-
return Ineligible(
387-
IneligibleReason.INSUFFICIENT_FAILURES,
388-
f"insufficient failures to make call, "
389-
f"pending events on suspected failure side: {partition.failed[-1].head_sha}",
390-
)
379+
# restarting oldest failed
380+
restart_commits.add(partition.failed[-1].head_sha)
391381

392382
if (
393383
infra_check_result == InfraCheckResult.RESTART_SUCCESS
394-
or partition.success_events_count() < 2
384+
and not partition.successful[0].has_pending
395385
):
396-
if not partition.successful[0].has_pending:
397-
# restarting newest successful
398-
restart_commits.add(partition.successful[0].head_sha)
399-
else:
400-
return Ineligible(
401-
IneligibleReason.INFRA_NOT_CONFIRMED,
402-
f"waiting on pending events on suspected success side: {partition.successful[0].head_sha}",
403-
)
386+
# restarting newest successful
387+
restart_commits.add(partition.successful[0].head_sha)
388+
389+
# additional heuristics to reduce uncertainty / flakiness
390+
REQUIRE_FAILED_EVENTS = 3
391+
REQUIRE_SUCCESS_EVENTS = 2
392+
393+
# this is a confidence heuristic to detect flakiness, can adjust the number of events as needed
394+
if (
395+
partition.failure_events_count() < REQUIRE_FAILED_EVENTS
396+
and not partition.failed[-1].has_pending
397+
):
398+
restart_commits.add(partition.failed[-1].head_sha)
399+
400+
if (
401+
partition.success_events_count() < REQUIRE_SUCCESS_EVENTS
402+
and not partition.successful[0].has_pending
403+
):
404+
restart_commits.add(partition.successful[0].head_sha)
404405

405406
if restart_commits:
406407
return RestartCommits(commit_shas=restart_commits)
@@ -411,6 +412,18 @@ def process_valid_autorevert_pattern(
411412
f"infra check result: {infra_check_result.value}",
412413
)
413414

415+
if partition.failure_events_count() < REQUIRE_FAILED_EVENTS:
416+
return Ineligible(
417+
IneligibleReason.INSUFFICIENT_FAILURES,
418+
f"not enough failures to make call: {partition.failure_events_count()}",
419+
)
420+
421+
if partition.success_events_count() < REQUIRE_SUCCESS_EVENTS:
422+
return Ineligible(
423+
IneligibleReason.INSUFFICIENT_SUCCESSES,
424+
f"not enough successes to make call: {partition.success_events_count()}",
425+
)
426+
414427
if partition.unknown:
415428
# there are still pending/missing commits in the unknown partition
416429
unknown_shas = ", ".join(c.head_sha for c in partition.unknown)

aws/lambda/pytorch-auto-revert/pytorch_auto_revert/tests/test_signal.py

Lines changed: 43 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -271,7 +271,6 @@ def test_insufficient_failures_returns_insufficient_failures_when_pending_on_fai
271271
res = s.process_valid_autorevert_pattern()
272272
self.assertIsInstance(res, Ineligible)
273273
self.assertEqual(res.reason, IneligibleReason.INSUFFICIENT_FAILURES)
274-
self.assertIn("sha_failed_pend", res.message)
275274

276275
def test_insufficient_successes_returns_restart_newest_success_when_no_pending(
277276
self,
@@ -307,8 +306,10 @@ def test_insufficient_successes_returns_restart_newest_success_when_no_pending(
307306
self.assertTrue(hasattr(res, "commit_shas"))
308307
self.assertIn("sha_success", res.commit_shas)
309308

310-
def test_insufficient_successes_infra_not_confirmed_when_pending_on_success(self):
311-
# Three failures present, but newest successful has pending → ineligible awaiting confirmation
309+
def test_insufficient_successes_returns_specific_reason_when_pending_on_success(
310+
self,
311+
):
312+
# Three failures present, but newest successful has pending → ineligible due to insufficient successes
312313
c_fail_newest = SignalCommit(
313314
head_sha="sha_fail_newest",
314315
events=[self._ev("job", SignalStatus.FAILURE, 11)],
@@ -335,8 +336,9 @@ def test_insufficient_successes_infra_not_confirmed_when_pending_on_success(self
335336
)
336337
res = s.process_valid_autorevert_pattern()
337338
self.assertIsInstance(res, Ineligible)
339+
# Should be ineligible due to insufficient successes, but infra check takes precedence and
340+
# it currently requires two successes to confirm not infra
338341
self.assertEqual(res.reason, IneligibleReason.INFRA_NOT_CONFIRMED)
339-
self.assertIn("sha_success_pend", res.message)
340342

341343
def test_both_sides_restart_accumulate_when_below_thresholds(self):
342344
# One failure total (<3) and one success total (<2), neither pending.
@@ -360,6 +362,43 @@ def test_both_sides_restart_accumulate_when_below_thresholds(self):
360362
self.assertIn("sha_failed", res.commit_shas)
361363
self.assertIn("sha_success", res.commit_shas)
362364

365+
def test_success_restart_even_when_failed_side_pending_and_insufficient_failures(
366+
self,
367+
):
368+
# Scenario:
369+
# - Only one failed event on the failed side, and that failed commit also has a pending event
370+
# - Success side has successes that are earlier than failure (so infra check yields RESTART_SUCCESS)
371+
# Expected: restart is still proposed on the success side (due to infra check),
372+
# even though failures < 3 and the failed commit is pending.
373+
374+
# Failed (newer): has PENDING and then FAILURE
375+
c_failed_pending = SignalCommit(
376+
head_sha="sha_fail_pend",
377+
events=[
378+
self._ev("job", SignalStatus.PENDING, 5),
379+
self._ev("job", SignalStatus.FAILURE, 6),
380+
],
381+
)
382+
# Successful (older): two successes earlier than any failure/pending, not pending
383+
c_success_ok = SignalCommit(
384+
head_sha="sha_success_ok",
385+
events=[
386+
self._ev("job", SignalStatus.SUCCESS, 2),
387+
self._ev("job", SignalStatus.SUCCESS, 4),
388+
],
389+
)
390+
s = Signal(
391+
key="job",
392+
workflow_name="wf",
393+
commits=[c_failed_pending, c_success_ok], # newest -> older
394+
)
395+
res = s.process_valid_autorevert_pattern()
396+
# Should be a RestartCommits proposing restart on the success side only
397+
self.assertNotIsInstance(res, AutorevertPattern)
398+
self.assertTrue(hasattr(res, "commit_shas"))
399+
self.assertIn("sha_success_ok", res.commit_shas)
400+
self.assertNotIn("sha_fail_pend", res.commit_shas)
401+
363402

364403
if __name__ == "__main__":
365404
unittest.main()

0 commit comments

Comments
 (0)