Skip to content

Commit ec6a206

Browse files
izaitsevfbmalfet
andauthored
[autorevert] fix handling of insufficient signals for restarts (#7209)
A bug introduced by #7204 The check that ensures that at least two failure/success signals are available before revert could be issued was broken. This PR restores. Here's an example of the wrong behavior: [2025-09-23_22-44-14.html](https://github.com/user-attachments/files/22503454/2025-09-23_22-44-14.html) --------- Co-authored-by: Nikita Shulga <[email protected]>
1 parent 744b626 commit ec6a206

File tree

2 files changed

+124
-10
lines changed

2 files changed

+124
-10
lines changed

aws/lambda/pytorch-auto-revert/pytorch_auto_revert/signal.py

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -368,18 +368,30 @@ def process_valid_autorevert_pattern(
368368
infra_check_result = partition.confirm_not_an_infra_issue()
369369
# note re: event_count < 2:
370370
# this is a confidence heuristic to detect flakiness, can adjust as needed
371-
if infra_check_result == InfraCheckResult.RESTART_FAILURE or (
372-
partition.failure_events_count() < 2
373-
and not partition.failed[-1].has_pending
371+
if (
372+
infra_check_result == InfraCheckResult.RESTART_FAILURE
373+
or partition.failure_events_count() < 2
374374
):
375-
# restarting oldest failed
376-
restart_commits.add(partition.failed[-1].head_sha)
377-
elif infra_check_result == InfraCheckResult.RESTART_SUCCESS or (
378-
partition.success_events_count() < 2
379-
and not partition.successful[0].has_pending
375+
if not partition.failed[-1].has_pending:
376+
# restarting oldest failed
377+
restart_commits.add(partition.failed[-1].head_sha)
378+
else:
379+
return Ineligible(
380+
IneligibleReason.INFRA_NOT_CONFIRMED,
381+
f"waiting on pending events on suspected failure side: {partition.failed[-1].head_sha}",
382+
)
383+
elif (
384+
infra_check_result == InfraCheckResult.RESTART_SUCCESS
385+
or partition.success_events_count() < 2
380386
):
381-
# restarting newest successful
382-
restart_commits.add(partition.successful[0].head_sha)
387+
if not partition.successful[0].has_pending:
388+
# restarting newest successful
389+
restart_commits.add(partition.successful[0].head_sha)
390+
else:
391+
return Ineligible(
392+
IneligibleReason.INFRA_NOT_CONFIRMED,
393+
f"waiting on pending events on suspected success side: {partition.successful[0].head_sha}",
394+
)
383395

384396
if restart_commits:
385397
return RestartCommits(commit_shas=restart_commits)

aws/lambda/pytorch-auto-revert/pytorch_auto_revert/tests/test_signal.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,108 @@ def test_detect_autorevert_pattern_ineligible_when_fixed(self):
222222
self.assertIsInstance(res, Ineligible)
223223
self.assertEqual(res.reason, IneligibleReason.FIXED)
224224

225+
def test_insufficient_failures_returns_restart_oldest_failed_when_no_pending(self):
226+
# Only one failure event overall (< 2) -> suggest restart of oldest failed
227+
c_failed = SignalCommit(
228+
head_sha="sha_failed",
229+
events=[self._ev("job", SignalStatus.FAILURE, 5)],
230+
)
231+
# Base successful commit with enough successes to avoid infra ambiguity
232+
c_base = SignalCommit(
233+
head_sha="sha_base",
234+
events=[
235+
self._ev("job", SignalStatus.SUCCESS, 3),
236+
self._ev("job", SignalStatus.SUCCESS, 7),
237+
],
238+
)
239+
s = Signal(key="job", workflow_name="wf", commits=[c_failed, c_base])
240+
res = s.process_valid_autorevert_pattern()
241+
self.assertIsNotNone(res)
242+
# With insufficient failures, we never produce an AutorevertPattern
243+
self.assertNotIsInstance(res, AutorevertPattern)
244+
# Should propose a restart on the suspected failure side (oldest failed)
245+
self.assertTrue(hasattr(res, "commit_shas"))
246+
self.assertIn("sha_failed", res.commit_shas)
247+
248+
def test_insufficient_failures_infra_not_confirmed_when_pending_on_failed(self):
249+
# Only one failure overall, but the failed commit also has pending → ineligible awaiting confirmation
250+
c_failed_pending = SignalCommit(
251+
head_sha="sha_failed_pend",
252+
events=[
253+
self._ev("job", SignalStatus.PENDING, 4),
254+
self._ev("job", SignalStatus.FAILURE, 5),
255+
],
256+
)
257+
c_base = SignalCommit(
258+
head_sha="sha_base",
259+
events=[
260+
self._ev("job", SignalStatus.SUCCESS, 3),
261+
self._ev("job", SignalStatus.SUCCESS, 7),
262+
],
263+
)
264+
s = Signal(key="job", workflow_name="wf", commits=[c_failed_pending, c_base])
265+
res = s.process_valid_autorevert_pattern()
266+
self.assertIsInstance(res, Ineligible)
267+
self.assertEqual(res.reason, IneligibleReason.INFRA_NOT_CONFIRMED)
268+
self.assertIn("sha_failed_pend", res.message)
269+
270+
def test_insufficient_successes_returns_restart_newest_success_when_no_pending(
271+
self,
272+
):
273+
# Ensure we have at least two failure events to avoid the failure-side heuristic
274+
c_fail_new = SignalCommit(
275+
head_sha="sha_fail_new",
276+
events=[self._ev("job", SignalStatus.FAILURE, 10)],
277+
)
278+
c_fail_old = SignalCommit(
279+
head_sha="sha_fail_old",
280+
events=[self._ev("job", SignalStatus.FAILURE, 9)],
281+
)
282+
# Only one success event overall (< 2) → suggest restart of newest successful
283+
c_success = SignalCommit(
284+
head_sha="sha_success",
285+
events=[self._ev("job", SignalStatus.SUCCESS, 3)],
286+
)
287+
s = Signal(
288+
key="job",
289+
workflow_name="wf",
290+
commits=[c_fail_new, c_fail_old, c_success],
291+
)
292+
res = s.process_valid_autorevert_pattern()
293+
self.assertIsNotNone(res)
294+
# With insufficient successes, we never produce an AutorevertPattern
295+
self.assertNotIsInstance(res, AutorevertPattern)
296+
# Should propose a restart on the suspected success side (newest successful)
297+
self.assertTrue(hasattr(res, "commit_shas"))
298+
self.assertIn("sha_success", res.commit_shas)
299+
300+
def test_insufficient_successes_infra_not_confirmed_when_pending_on_success(self):
301+
# Two failures present, but newest successful has pending → ineligible awaiting confirmation
302+
c_fail_new = SignalCommit(
303+
head_sha="sha_fail_new",
304+
events=[self._ev("job", SignalStatus.FAILURE, 10)],
305+
)
306+
c_fail_old = SignalCommit(
307+
head_sha="sha_fail_old",
308+
events=[self._ev("job", SignalStatus.FAILURE, 9)],
309+
)
310+
c_success_pending = SignalCommit(
311+
head_sha="sha_success_pend",
312+
events=[
313+
self._ev("job", SignalStatus.SUCCESS, 3),
314+
self._ev("job", SignalStatus.PENDING, 8),
315+
],
316+
)
317+
s = Signal(
318+
key="job",
319+
workflow_name="wf",
320+
commits=[c_fail_new, c_fail_old, c_success_pending],
321+
)
322+
res = s.process_valid_autorevert_pattern()
323+
self.assertIsInstance(res, Ineligible)
324+
self.assertEqual(res.reason, IneligibleReason.INFRA_NOT_CONFIRMED)
325+
self.assertIn("sha_success_pend", res.message)
326+
225327

226328
if __name__ == "__main__":
227329
unittest.main()

0 commit comments

Comments
 (0)