Skip to content

Commit 6ec1bf7

Browse files
authored
[AUTOREVERT] Add retry with back-off for GH API and CH (#7241)
Just going over the code, finding where we call external APIs, and adding a retry with exponential back-off. Defaults to 5 retries, 0.5s base, and 10% jitter. There are NO CODE CHANGES; all relevant parts of the code are being guardrailed with: ``` for attempt in RetryWithBackoff(): with attempt: # the code ``` Changes appear to be big due to: * Extra tabs and the consequent linter changes * The lazy nature of the gh and ch libraries, which resolve pagination as the code consumes information
1 parent 06985bf commit 6ec1bf7

File tree

8 files changed

+415
-285
lines changed

8 files changed

+415
-285
lines changed

aws/lambda/pytorch-auto-revert/pytorch_auto_revert/autorevert_checker.py

Lines changed: 82 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from typing import Dict, Iterable, List, Optional, Set, Tuple
1212

1313
from .clickhouse_client_helper import CHCliFactory
14+
from .utils import RetryWithBackoff
1415

1516

1617
@dataclass
@@ -160,45 +161,47 @@ def _fetch_workflow_data(self):
160161
wf.workflow_name, push_dedup.timestamp DESC, wf.head_sha, wf.name
161162
"""
162163

163-
result = CHCliFactory().client.query(
164-
query,
165-
parameters={
166-
"workflow_names": self.workflow_names,
167-
"lookback_time": lookback_time,
168-
},
169-
)
170-
171-
# Group by workflow and commit SHA
172-
workflow_commits_data = {}
173-
for row in result.result_rows:
174-
(
175-
workflow_name,
176-
head_sha,
177-
name,
178-
conclusion,
179-
status,
180-
classification_rule,
181-
created_at,
182-
) = row
183-
184-
if workflow_name not in workflow_commits_data:
185-
workflow_commits_data[workflow_name] = {}
186-
187-
if head_sha not in workflow_commits_data[workflow_name]:
188-
workflow_commits_data[workflow_name][head_sha] = CommitJobs(
189-
head_sha=head_sha, created_at=created_at, jobs=[]
164+
for attempt in RetryWithBackoff():
165+
with attempt:
166+
result = CHCliFactory().client.query(
167+
query,
168+
parameters={
169+
"workflow_names": self.workflow_names,
170+
"lookback_time": lookback_time,
171+
},
190172
)
191173

192-
workflow_commits_data[workflow_name][head_sha].jobs.append(
193-
JobResult(
194-
head_sha=head_sha,
195-
name=name,
196-
conclusion=conclusion,
197-
status=status,
198-
classification_rule=classification_rule or "",
199-
workflow_created_at=created_at,
200-
)
201-
)
174+
# Group by workflow and commit SHA
175+
workflow_commits_data = {}
176+
for row in result.result_rows:
177+
(
178+
workflow_name,
179+
head_sha,
180+
name,
181+
conclusion,
182+
status,
183+
classification_rule,
184+
created_at,
185+
) = row
186+
187+
if workflow_name not in workflow_commits_data:
188+
workflow_commits_data[workflow_name] = {}
189+
190+
if head_sha not in workflow_commits_data[workflow_name]:
191+
workflow_commits_data[workflow_name][head_sha] = CommitJobs(
192+
head_sha=head_sha, created_at=created_at, jobs=[]
193+
)
194+
195+
workflow_commits_data[workflow_name][head_sha].jobs.append(
196+
JobResult(
197+
head_sha=head_sha,
198+
name=name,
199+
conclusion=conclusion,
200+
status=status,
201+
classification_rule=classification_rule or "",
202+
workflow_created_at=created_at,
203+
)
204+
)
202205

203206
# Sort and cache results per workflow
204207
for workflow_name, commits_data in workflow_commits_data.items():
@@ -230,14 +233,16 @@ def _fetch_commit_history(self):
230233
ORDER BY timestamp DESC
231234
"""
232235

233-
result = CHCliFactory().client.query(
234-
query, parameters={"lookback_time": lookback_time}
235-
)
236+
for attempt in RetryWithBackoff():
237+
with attempt:
238+
result = CHCliFactory().client.query(
239+
query, parameters={"lookback_time": lookback_time}
240+
)
236241

237-
return [
238-
{"sha": row[0], "message": row[1], "timestamp": row[2]}
239-
for row in result.result_rows
240-
]
242+
return [
243+
{"sha": row[0], "message": row[1], "timestamp": row[2]}
244+
for row in result.result_rows
245+
]
241246

242247
def _find_last_commit_with_job(
243248
self, commits: Iterable[CommitJobs], job_name: str
@@ -472,18 +477,20 @@ def _fetch_single_commit_jobs(
472477
we = "workflow_dispatch" if restarted_only else "workflow_dispatch"
473478
# Note: for non-restarted we exclude workflow_dispatch via != in WHERE above
474479

475-
result = CHCliFactory().client.query(
476-
query,
477-
parameters={
478-
"workflow_name": workflow_name,
479-
"head_sha": head_sha,
480-
"we": we,
481-
"hb": hb,
482-
"lookback_time": lookback_time,
483-
},
484-
)
480+
for attempt in RetryWithBackoff():
481+
with attempt:
482+
result = CHCliFactory().client.query(
483+
query,
484+
parameters={
485+
"workflow_name": workflow_name,
486+
"head_sha": head_sha,
487+
"we": we,
488+
"hb": hb,
489+
"lookback_time": lookback_time,
490+
},
491+
)
485492

486-
rows = list(result.result_rows)
493+
rows = list(result.result_rows)
487494
if not rows:
488495
return None
489496

@@ -644,24 +651,27 @@ def extract_revert_categories_batch(self, messages: List[str]) -> Dict[str, str]
644651
FROM issue_comment
645652
WHERE id IN {comment_ids:Array(Int64)}
646653
"""
647-
result = CHCliFactory().client.query(
648-
query, parameters={"comment_ids": comment_ids}
649-
)
650654

651-
for row in result.result_rows:
652-
comment_id, body = row
653-
# Look for -c flag in comment body
654-
match = re.search(r"-c\s+(\w+)", body)
655-
if match:
656-
category = match.group(1).lower()
657-
if category in [
658-
"nosignal",
659-
"ignoredsignal",
660-
"landrace",
661-
"weird",
662-
"ghfirst",
663-
]:
664-
comment_id_to_category[comment_id] = category
655+
for attempt in RetryWithBackoff():
656+
with attempt:
657+
result = CHCliFactory().client.query(
658+
query, parameters={"comment_ids": comment_ids}
659+
)
660+
661+
for row in result.result_rows:
662+
comment_id, body = row
663+
# Look for -c flag in comment body
664+
match = re.search(r"-c\s+(\w+)", body)
665+
if match:
666+
category = match.group(1).lower()
667+
if category in [
668+
"nosignal",
669+
"ignoredsignal",
670+
"landrace",
671+
"weird",
672+
"ghfirst",
673+
]:
674+
comment_id_to_category[comment_id] = category
665675
except Exception:
666676
# If query fails, continue without error
667677
pass

aws/lambda/pytorch-auto-revert/pytorch_auto_revert/autorevert_circuit_breaker.py

Lines changed: 37 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import logging
22

33
from .github_client_helper import GHClientFactory
4+
from .utils import RetryWithBackoff
45

56

67
logger = logging.getLogger(__name__)
@@ -17,38 +18,42 @@ def check_autorevert_disabled(repo_full_name: str = "pytorch/pytorch") -> bool:
1718
True if autorevert should be disabled (circuit breaker active), False otherwise
1819
"""
1920
try:
20-
gh_client = GHClientFactory().client
21-
repo = gh_client.get_repo(repo_full_name)
22-
23-
should_disable = False
24-
25-
# Search for open issues with the specific label
26-
disable_issues = repo.get_issues(
27-
state="open", labels=["ci: disable-autorevert"]
28-
)
29-
30-
for issue in disable_issues:
31-
logger.info(
32-
f"Found open issue #{issue.number} with 'ci: disable-autorevert' label "
33-
f"created by user {issue.user.login}. "
34-
f"Autorevert circuit breaker is ACTIVE."
35-
)
36-
should_disable = True
37-
38-
sev_issues = repo.get_issues(state="open", labels=["ci: sev"])
39-
for issue in sev_issues:
40-
logger.info(
41-
f"Found open issue #{issue.number} with 'ci: sev' label "
42-
f"created by user {issue.user.login}. "
43-
f"Autorevert circuit breaker is ACTIVE."
44-
)
45-
should_disable = True
46-
47-
if should_disable:
48-
return True
49-
50-
logger.debug("No open issues with 'ci: disable-autorevert' label found.")
51-
return False
21+
for attempt in RetryWithBackoff():
22+
with attempt:
23+
gh_client = GHClientFactory().client
24+
repo = gh_client.get_repo(repo_full_name)
25+
26+
should_disable = False
27+
28+
# Search for open issues with the specific label
29+
disable_issues = repo.get_issues(
30+
state="open", labels=["ci: disable-autorevert"]
31+
)
32+
33+
for issue in disable_issues:
34+
logger.info(
35+
f"Found open issue #{issue.number} with 'ci: disable-autorevert' label "
36+
f"created by user {issue.user.login}. "
37+
f"Autorevert circuit breaker is ACTIVE."
38+
)
39+
should_disable = True
40+
41+
sev_issues = repo.get_issues(state="open", labels=["ci: sev"])
42+
for issue in sev_issues:
43+
logger.info(
44+
f"Found open issue #{issue.number} with 'ci: sev' label "
45+
f"created by user {issue.user.login}. "
46+
f"Autorevert circuit breaker is ACTIVE."
47+
)
48+
should_disable = True
49+
50+
if should_disable:
51+
return True
52+
53+
logger.debug(
54+
"No open issues with 'ci: disable-autorevert' label found."
55+
)
56+
return False
5257

5358
except Exception as e:
5459
logger.error(f"Error checking autorevert circuit breaker: {e}")

aws/lambda/pytorch-auto-revert/pytorch_auto_revert/run_state_logger.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from .clickhouse_client_helper import CHCliFactory
77
from .signal import AutorevertPattern, Ineligible, RestartCommits, Signal
88
from .signal_extraction_types import RunContext
9+
from .utils import RetryWithBackoff
910

1011

1112
SignalProcOutcome = Union[AutorevertPattern, RestartCommits, Ineligible]
@@ -180,7 +181,12 @@ def insert_state(
180181
params or "",
181182
]
182183
]
183-
CHCliFactory().client.insert(
184-
table="autorevert_state", data=data, column_names=cols, database="misc"
185-
)
184+
for attempt in RetryWithBackoff():
185+
with attempt:
186+
CHCliFactory().client.insert(
187+
table="autorevert_state",
188+
data=data,
189+
column_names=cols,
190+
database="misc",
191+
)
186192
return state_json

0 commit comments

Comments
 (0)